diff options
Diffstat (limited to 'pkg/unet/unet_unsafe.go')
-rw-r--r-- | pkg/unet/unet_unsafe.go | 289 |
1 files changed, 289 insertions, 0 deletions
diff --git a/pkg/unet/unet_unsafe.go b/pkg/unet/unet_unsafe.go new file mode 100644 index 000000000..fa0916439 --- /dev/null +++ b/pkg/unet/unet_unsafe.go @@ -0,0 +1,289 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unet + +import ( + "io" + "math" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// wait blocks until the socket FD is ready for reading or writing, depending +// on the value of write. +// +// Returns errClosing if the Socket is in the process of closing. +func (s *Socket) wait(write bool) error { + for { + // Checking the FD on each loop is not strictly necessary, it + // just avoids an extra poll call. + fd := atomic.LoadInt32(&s.fd) + if fd < 0 { + return errClosing + } + + events := []linux.PollFD{ + { + // The actual socket FD. + FD: fd, + Events: linux.POLLIN, + }, + { + // The eventfd, signaled when we are closing. + FD: int32(s.efd), + Events: linux.POLLIN, + }, + } + if write { + events[0].Events = linux.POLLOUT + } + + _, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(&events[0])), 2, uintptr(math.MaxUint64)) + if e == syscall.EINTR { + continue + } + if e != 0 { + return e + } + + if events[1].REvents&linux.POLLIN == linux.POLLIN { + // eventfd signaled, we're closing. + return errClosing + } + + return nil + } +} + +// buildIovec builds an iovec slice from the given []byte slice. +// +// iovecs is used as an initial slice, to avoid excessive allocations. +func buildIovec(bufs [][]byte, iovecs []syscall.Iovec) ([]syscall.Iovec, int) { + var length int + for i := range bufs { + if l := len(bufs[i]); l > 0 { + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(l), + }) + length += l + } + } + return iovecs, length +} + +// ReadVec reads into the pre-allocated bufs. Returns bytes read. +// +// The pre-allocatted space used by ReadVec is based upon slice lengths. +// +// This function is not guaranteed to read all available data, it +// returns as soon as a single recvmsg call succeeds. +func (r *SocketReader) ReadVec(bufs [][]byte) (int, error) { + iovecs, length := buildIovec(bufs, make([]syscall.Iovec, 0, 2)) + + var msg syscall.Msghdr + if len(r.source) != 0 { + msg.Name = &r.source[0] + msg.Namelen = uint32(len(r.source)) + } + + if len(r.ControlMessage) != 0 { + msg.Control = &r.ControlMessage[0] + msg.Controllen = uint64(len(r.ControlMessage)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + // n is the bytes received. + var n uintptr + + fd, ok := r.socket.enterFD() + if !ok { + return 0, syscall.EBADF + } + // Leave on returns below. + for { + var e syscall.Errno + + // Try a non-blocking recv first, so we don't give up the go runtime M. + n, _, e = syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_TRUNC) + if e == 0 { + break + } + if e == syscall.EINTR { + continue + } + if !r.blocking { + r.socket.gate.Leave() + return 0, e + } + if e != syscall.EAGAIN && e != syscall.EWOULDBLOCK { + r.socket.gate.Leave() + return 0, e + } + + // Wait for the socket to become readable. + err := r.socket.wait(false) + if err == errClosing { + err = syscall.EBADF + } + if err != nil { + r.socket.gate.Leave() + return 0, err + } + } + + r.socket.gate.Leave() + + if msg.Controllen < uint64(len(r.ControlMessage)) { + r.ControlMessage = r.ControlMessage[:msg.Controllen] + } + + if msg.Namelen < uint32(len(r.source)) { + r.source = r.source[:msg.Namelen] + } + + // All unet sockets are SOCK_STREAM or SOCK_SEQPACKET, both of which + // indicate that the other end is closed by returning a 0 length read + // with no error. + if n == 0 { + return 0, io.EOF + } + + if r.race != nil { + // See comments on Socket.race. + atomic.AddInt32(r.race, 1) + } + + if int(n) > length { + return length, errMessageTruncated + } + + return int(n), nil +} + +// WriteVec writes the bufs to the socket. Returns bytes written. +// +// This function is not guaranteed to send all data, it returns +// as soon as a single sendmsg call succeeds. +func (w *SocketWriter) WriteVec(bufs [][]byte) (int, error) { + iovecs, _ := buildIovec(bufs, make([]syscall.Iovec, 0, 2)) + + if w.race != nil { + // See comments on Socket.race. + atomic.AddInt32(w.race, 1) + } + + var msg syscall.Msghdr + if len(w.to) != 0 { + msg.Name = &w.to[0] + msg.Namelen = uint32(len(w.to)) + } + + if len(w.ControlMessage) != 0 { + msg.Control = &w.ControlMessage[0] + msg.Controllen = uint64(len(w.ControlMessage)) + } + + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + fd, ok := w.socket.enterFD() + if !ok { + return 0, syscall.EBADF + } + // Leave on returns below. + for { + // Try a non-blocking send first, so we don't give up the go runtime M. + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e == 0 { + w.socket.gate.Leave() + return int(n), nil + } + if e == syscall.EINTR { + continue + } + if !w.blocking { + w.socket.gate.Leave() + return 0, e + } + if e != syscall.EAGAIN && e != syscall.EWOULDBLOCK { + w.socket.gate.Leave() + return 0, e + } + + // Wait for the socket to become writeable. + err := w.socket.wait(true) + if err == errClosing { + err = syscall.EBADF + } + if err != nil { + w.socket.gate.Leave() + return 0, err + } + } + // Unreachable, no s.gate.Leave needed. +} + +// getsockopt issues a getsockopt syscall. +func getsockopt(fd int, level int, optname int, buf []byte) (uint32, error) { + l := uint32(len(buf)) + _, _, e := syscall.RawSyscall6(syscall.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(optname), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l)), 0) + if e != 0 { + return 0, e + } + + return l, nil +} + +// setsockopt issues a setsockopt syscall. +func setsockopt(fd int, level int, optname int, buf []byte) error { + _, _, e := syscall.RawSyscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(optname), uintptr(unsafe.Pointer(&buf[0])), uintptr(len(buf)), 0) + if e != 0 { + return e + } + + return nil +} + +// getsockname issues a getsockname syscall. +func getsockname(fd int, buf []byte) (uint32, error) { + l := uint32(len(buf)) + _, _, e := syscall.RawSyscall(syscall.SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l))) + if e != 0 { + return 0, e + } + + return l, nil +} + +// getpeername issues a getpeername syscall. +func getpeername(fd int, buf []byte) (uint32, error) { + l := uint32(len(buf)) + _, _, e := syscall.RawSyscall(syscall.SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(&buf[0])), uintptr(unsafe.Pointer(&l))) + if e != 0 { + return 0, e + } + + return l, nil +} |