diff options
Diffstat (limited to 'pkg/sentry/socket/hostinet')
-rw-r--r-- | pkg/sentry/socket/hostinet/BUILD | 53 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/device.go | 19 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/hostinet.go | 17 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/save_restore.go | 20 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/socket.go | 562 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/socket_unsafe.go | 138 | ||||
-rw-r--r-- | pkg/sentry/socket/hostinet/stack.go | 244 |
7 files changed, 1053 insertions, 0 deletions
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD new file mode 100644 index 000000000..60ec265ba --- /dev/null +++ b/pkg/sentry/socket/hostinet/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "hostinet_state", + srcs = [ + "save_restore.go", + "socket.go", + "stack.go", + ], + out = "hostinet_autogen_state.go", + package = "hostinet", +) + +go_library( + name = "hostinet", + srcs = [ + "device.go", + "hostinet.go", + "hostinet_autogen_state.go", + "save_restore.go", + "socket.go", + "socket_unsafe.go", + "stack.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) diff --git a/pkg/sentry/socket/hostinet/device.go b/pkg/sentry/socket/hostinet/device.go new file mode 100644 index 000000000..a9a673316 --- /dev/null +++ b/pkg/sentry/socket/hostinet/device.go @@ -0,0 +1,19 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +var socketDevice = device.NewAnonDevice() diff --git a/pkg/sentry/socket/hostinet/hostinet.go b/pkg/sentry/socket/hostinet/hostinet.go new file mode 100644 index 000000000..67c6c8066 --- /dev/null +++ b/pkg/sentry/socket/hostinet/hostinet.go @@ -0,0 +1,17 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostinet implements AF_INET and AF_INET6 sockets using the host's +// network stack. +package hostinet diff --git a/pkg/sentry/socket/hostinet/save_restore.go b/pkg/sentry/socket/hostinet/save_restore.go new file mode 100644 index 000000000..0821a794a --- /dev/null +++ b/pkg/sentry/socket/hostinet/save_restore.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +// beforeSave is invoked by stateify. +func (*socketOperations) beforeSave() { + panic("host.socketOperations is not savable") +} diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go new file mode 100644 index 000000000..defa3db2c --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket.go @@ -0,0 +1,562 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +const ( + sizeofInt32 = 4 + + // sizeofSockaddr is the size in bytes of the largest sockaddr type + // supported by this package. + sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) +) + +// socketOperations implements fs.FileOperations and socket.Socket for a socket +// implemented using a host socket. +type socketOperations struct { + socket.ReceiveTimeout + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + fd int // must be O_NONBLOCK + queue waiter.Queue +} + +func newSocketFile(ctx context.Context, fd int, nonblock bool) (*fs.File, *syserr.Error) { + s := &socketOperations{fd: fd} + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + dirent := socket.NewDirent(ctx, socketDevice) + return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true}, s), nil +} + +// Release implements fs.FileOperations.Release. +func (s *socketOperations) Release() { + fdnotifier.RemoveFD(int32(s.fd)) + syscall.Close(s.fd) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(s.fd), mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(s.fd)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketOperations) EventUnregister(e *waiter.Entry) { + s.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(s.fd)) +} + +// Read implements fs.FileOperations.Read. +func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil + } + if dsts.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + n, err := syscall.Read(s.fd, dsts.Head().ToSlice()) + if err != nil { + return 0, translateIOSyscallError(err) + } + return uint64(n), nil + } + return readv(s.fd, iovecsFromBlockSeq(dsts)) + })) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of src.Addrs was unusable. + if uint64(src.NumBytes()) != srcs.NumBytes() { + return 0, nil + } + if srcs.IsEmpty() { + return 0, nil + } + if srcs.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + n, err := syscall.Write(s.fd, srcs.Head().ToSlice()) + if err != nil { + return 0, translateIOSyscallError(err) + } + return uint64(n), nil + } + return writev(s.fd, iovecsFromBlockSeq(srcs)) + })) + return int64(n), err +} + +// Connect implements socket.Socket.Connect. +func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { + if len(sockaddr) > sizeofSockaddr { + sockaddr = sockaddr[:sizeofSockaddr] + } + + _, _, errno := syscall.Syscall(syscall.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) + + if errno == 0 { + return nil + } + if errno != syscall.EINPROGRESS || !blocking { + return syserr.FromError(translateIOSyscallError(errno)) + } + + // "EINPROGRESS: The socket is nonblocking and the connection cannot be + // completed immediately. It is possible to select(2) or poll(2) for + // completion by selecting the socket for writing. After select(2) + // indicates writability, use getsockopt(2) to read the SO_ERROR option at + // level SOL-SOCKET to determine whether connect() completed successfully + // (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error + // codes listed here, explaining the reason for the failure)." - connect(2) + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + if s.Readiness(waiter.EventOut)&waiter.EventOut == 0 { + if err := t.Block(ch); err != nil { + return syserr.FromError(err) + } + } + val, err := syscall.GetsockoptInt(s.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) + if err != nil { + return syserr.FromError(err) + } + if val != 0 { + return syserr.FromError(syscall.Errno(uintptr(val))) + } + return nil +} + +// Accept implements socket.Socket.Accept. +func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (kdefs.FD, interface{}, uint32, *syserr.Error) { + var peerAddr []byte + var peerAddrlen uint32 + var peerAddrPtr *byte + var peerAddrlenPtr *uint32 + if peerRequested { + peerAddr = make([]byte, sizeofSockaddr) + peerAddrlen = uint32(len(peerAddr)) + peerAddrPtr = &peerAddr[0] + peerAddrlenPtr = &peerAddrlen + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. + fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK) + if blocking { + var ch chan struct{} + for syscallErr == syserror.ErrWouldBlock { + if ch != nil { + if syscallErr = t.Block(ch); syscallErr != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + } + fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK) + } + } + + if peerRequested { + peerAddr = peerAddr[:peerAddrlen] + } + if syscallErr != nil { + return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) + } + + f, err := newSocketFile(t, fd, flags&syscall.SOCK_NONBLOCK != 0) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + fdFlags := kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + } + kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) + return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) +} + +// Bind implements socket.Socket.Bind. +func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + if len(sockaddr) > sizeofSockaddr { + sockaddr = sockaddr[:sizeofSockaddr] + } + + _, _, errno := syscall.Syscall(syscall.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// Listen implements socket.Socket.Listen. +func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { + return syserr.FromError(syscall.Listen(s.fd, backlog)) +} + +// Shutdown implements socket.Socket.Shutdown. +func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { + switch how { + case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: + return syserr.FromError(syscall.Shutdown(s.fd, how)) + default: + return syserr.ErrInvalidArgument + } +} + +// GetSockOpt implements socket.Socket.GetSockOpt. +func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) { + if outLen < 0 { + return nil, syserr.ErrInvalidArgument + } + + // Whitelist options and constrain option length. + var optlen int + switch level { + case syscall.SOL_IPV6: + switch name { + case syscall.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case syscall.SOL_SOCKET: + switch name { + case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR, syscall.SO_TYPE: + optlen = sizeofInt32 + case syscall.SO_LINGER: + optlen = syscall.SizeofLinger + } + case syscall.SOL_TCP: + switch name { + case syscall.TCP_NODELAY: + optlen = sizeofInt32 + case syscall.TCP_INFO: + optlen = int(linux.SizeOfTCPInfo) + } + } + if optlen == 0 { + return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT + } + if outLen < optlen { + return nil, syserr.ErrInvalidArgument + } + + opt, err := getsockopt(s.fd, level, name, optlen) + if err != nil { + return nil, syserr.FromError(err) + } + return opt, nil +} + +// SetSockOpt implements socket.Socket.SetSockOpt. +func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { + // Whitelist options and constrain option length. + var optlen int + switch level { + case syscall.SOL_IPV6: + switch name { + case syscall.IPV6_V6ONLY: + optlen = sizeofInt32 + } + case syscall.SOL_SOCKET: + switch name { + case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR: + optlen = sizeofInt32 + } + case syscall.SOL_TCP: + switch name { + case syscall.TCP_NODELAY: + optlen = sizeofInt32 + } + } + if optlen == 0 { + // Pretend to accept socket options we don't understand. This seems + // dangerous, but it's what netstack does... + return nil + } + if len(opt) < optlen { + return syserr.ErrInvalidArgument + } + opt = opt[:optlen] + + _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0) + if errno != 0 { + return syserr.FromError(errno) + } + return nil +} + +// RecvMsg implements socket.Socket.RecvMsg. +func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, interface{}, uint32, unix.ControlMessages, *syserr.Error) { + // Whitelist flags. + // + // FIXME: We can't support MSG_ERRQUEUE because it uses ancillary + // messages that netstack/tcpip/transport/unix doesn't understand. Kill the + // Socket interface's dependence on netstack. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 { + return 0, nil, 0, unix.ControlMessages{}, syserr.ErrInvalidArgument + } + + var senderAddr []byte + if senderRequested { + senderAddr = make([]byte, sizeofSockaddr) + } + + recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of dst.Addrs was unusable. + if uint64(dst.NumBytes()) != dsts.NumBytes() { + return 0, nil + } + if dsts.IsEmpty() { + return 0, nil + } + + // We always do a non-blocking recv*(). + sysflags := flags | syscall.MSG_DONTWAIT + + if dsts.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddr) + } + + iovs := iovecsFromBlockSeq(dsts) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(senderAddr) != 0 { + msg.Name = &senderAddr[0] + msg.Namelen = uint32(len(senderAddr)) + } + n, err := recvmsg(s.fd, &msg, sysflags) + if err != nil { + return 0, err + } + senderAddr = senderAddr[:msg.Namelen] + return n, nil + }) + + var ch chan struct{} + n, err := dst.CopyOutFrom(t, recvmsgToBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. + if n != 0 { + panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventIn) + defer s.EventUnregister(&e) + } + n, err = dst.CopyOutFrom(t, recvmsgToBlocks) + } + } + + return int(n), senderAddr, uint32(len(senderAddr)), unix.ControlMessages{}, syserr.FromError(err) +} + +// SendMsg implements socket.Socket.SendMsg. +func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, controlMessages unix.ControlMessages) (int, *syserr.Error) { + // Whitelist flags. + if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { + return 0, syserr.ErrInvalidArgument + } + + sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { + // Refuse to do anything if any part of src.Addrs was unusable. + if uint64(src.NumBytes()) != srcs.NumBytes() { + return 0, nil + } + if srcs.IsEmpty() { + return 0, nil + } + + // We always do a non-blocking send*(). + sysflags := flags | syscall.MSG_DONTWAIT + + if srcs.NumBlocks() == 1 { + // Skip allocating []syscall.Iovec. + src := srcs.Head() + n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil + } + + iovs := iovecsFromBlockSeq(srcs) + msg := syscall.Msghdr{ + Iov: &iovs[0], + Iovlen: uint64(len(iovs)), + } + if len(to) != 0 { + msg.Name = &to[0] + msg.Namelen = uint32(len(to)) + } + return sendmsg(s.fd, &msg, sysflags) + }) + + var ch chan struct{} + n, err := src.CopyInTo(t, sendmsgFromBlocks) + if flags&syscall.MSG_DONTWAIT == 0 { + for err == syserror.ErrWouldBlock { + // We only expect blocking to come from the actual syscall, in which + // case it can't have returned any data. + if n != 0 { + panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) + } + if ch != nil { + if err = t.Block(ch); err != nil { + break + } + } else { + var e waiter.Entry + e, ch = waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventOut) + defer s.EventUnregister(&e) + } + n, err = src.CopyInTo(t, sendmsgFromBlocks) + } + } + + return int(n), syserr.FromError(err) +} + +func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec { + iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) + for ; !bs.IsEmpty(); bs = bs.Tail() { + b := bs.Head() + iovs = append(iovs, syscall.Iovec{ + Base: &b.ToSlice()[0], + Len: uint64(b.Len()), + }) + // We don't need to care about b.NeedSafecopy(), because the host + // kernel will handle such address ranges just fine (by returning + // EFAULT). + } + return iovs +} + +func translateIOSyscallError(err error) error { + if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +type socketProvider struct { + family int +} + +// Socket implements socket.Provider.Socket. +func (p *socketProvider) Socket(t *kernel.Task, stypeflags unix.SockType, protocol int) (*fs.File, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := int(stypeflags) & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newSocketFile(t, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProvider) Pair(t *kernel.Task, stype unix.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} + +func init() { + for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { + socket.RegisterProvider(family, &socketProvider{family}) + } +} diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go new file mode 100644 index 000000000..f8bb75636 --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func firstBytePtr(bs []byte) unsafe.Pointer { + if bs == nil { + return nil + } + return unsafe.Pointer(&bs[0]) +} + +// Preconditions: len(dsts) != 0. +func readv(fd int, dsts []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&dsts[0])), uintptr(len(dsts))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +// Preconditions: len(srcs) != 0. +func writev(fd int, srcs []syscall.Iovec) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&srcs[0])), uintptr(len(srcs))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (s *socketOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := uintptr(args[1].Int()); cmd { + case syscall.TIOCINQ, syscall.TIOCOUTQ: + var val int32 + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + return 0, translateIOSyscallError(errno) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], uint32(val)) + _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func accept4(fd int, addr *byte, addrlen *uint32, flags int) (int, error) { + afd, _, errno := syscall.Syscall6(syscall.SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(addr)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return int(afd), nil +} + +func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { + opt := make([]byte, optlen) + optlen32 := int32(len(opt)) + _, _, errno := syscall.Syscall6(syscall.SYS_GETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(unsafe.Pointer(&optlen32)), 0) + if errno != 0 { + return nil, errno + } + return opt[:optlen32], nil +} + +// GetSockName implements socket.Socket.GetSockName. +func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return addr[:addrlen], addrlen, nil +} + +// GetPeerName implements socket.Socket.GetPeerName. +func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) { + addr := make([]byte, sizeofSockaddr) + addrlen := uint32(len(addr)) + _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) + if errno != 0 { + return nil, 0, syserr.FromError(errno) + } + return addr[:addrlen], addrlen, nil +} + +func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) { + fromLen := uint32(len(*from)) + n, _, errno := syscall.Syscall6(syscall.SYS_RECVFROM, uintptr(fd), uintptr(firstBytePtr(dst)), uintptr(len(dst)), uintptr(flags), uintptr(firstBytePtr(*from)), uintptr(unsafe.Pointer(&fromLen))) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + *from = (*from)[:fromLen] + return uint64(n), nil +} + +func recvmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} + +func sendmsg(fd int, msg *syscall.Msghdr, flags int) (uint64, error) { + n, _, errno := syscall.Syscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + if errno != 0 { + return 0, translateIOSyscallError(errno) + } + return uint64(n), nil +} diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go new file mode 100644 index 000000000..44c3b9a3f --- /dev/null +++ b/pkg/sentry/socket/hostinet/stack.go @@ -0,0 +1,244 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "fmt" + "io/ioutil" + "os" + "strings" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/binary" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +var defaultRecvBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 87380, + Max: 6291456, +} + +var defaultSendBufSize = inet.TCPBufferSize{ + Min: 4096, + Default: 16384, + Max: 4194304, +} + +// Stack implements inet.Stack for host sockets. +type Stack struct { + // Stack is immutable. + interfaces map[int32]inet.Interface + interfaceAddrs map[int32][]inet.InterfaceAddr + supportsIPv6 bool + tcpRecvBufSize inet.TCPBufferSize + tcpSendBufSize inet.TCPBufferSize + tcpSACKEnabled bool +} + +// NewStack returns an empty Stack containing no configuration. +func NewStack() *Stack { + return &Stack{ + interfaces: make(map[int32]inet.Interface), + interfaceAddrs: make(map[int32][]inet.InterfaceAddr), + } +} + +// Configure sets up the stack using the current state of the host network. +func (s *Stack) Configure() error { + if err := addHostInterfaces(s); err != nil { + return err + } + + if _, err := os.Stat("/proc/net/if_inet6"); err == nil { + s.supportsIPv6 = true + } + + s.tcpRecvBufSize = defaultRecvBufSize + if tcpRMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_rmem"); err == nil { + s.tcpRecvBufSize = tcpRMem + } else { + log.Warningf("Failed to read TCP receive buffer size, using default values") + } + + s.tcpSendBufSize = defaultSendBufSize + if tcpWMem, err := readTCPBufferSizeFile("/proc/sys/net/ipv4/tcp_wmem"); err == nil { + s.tcpSendBufSize = tcpWMem + } else { + log.Warningf("Failed to read TCP send buffer size, using default values") + } + + s.tcpSACKEnabled = false + if sack, err := ioutil.ReadFile("/proc/sys/net/ipv4/tcp_sack"); err == nil { + s.tcpSACKEnabled = strings.TrimSpace(string(sack)) != "0" + } else { + log.Warningf("Failed to read if TCP SACK if enabled, setting to false") + } + + return nil +} + +// ExtractHostInterfaces will populate an interface map and +// interfaceAddrs map with the results of the equivalent +// netlink messages. +func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.NetlinkMessage, interfaces map[int32]inet.Interface, interfaceAddrs map[int32][]inet.InterfaceAddr) error { + for _, link := range links { + if link.Header.Type != syscall.RTM_NEWLINK { + continue + } + if len(link.Data) < syscall.SizeofIfInfomsg { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid data length (%d bytes, expected at least %d bytes)", len(link.Data), syscall.SizeofIfInfomsg) + } + var ifinfo syscall.IfInfomsg + binary.Unmarshal(link.Data[:syscall.SizeofIfInfomsg], usermem.ByteOrder, &ifinfo) + inetIF := inet.Interface{ + DeviceType: ifinfo.Type, + Flags: ifinfo.Flags, + } + // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the + // syscall.NetlinkMessage.Header.Type and skip the struct ifinfomsg + // accordingly. + attrs, err := syscall.ParseNetlinkRouteAttr(&link) + if err != nil { + return fmt.Errorf("RTM_GETLINK returned RTM_NEWLINK message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFLA_ADDRESS: + inetIF.Addr = attr.Value + case syscall.IFLA_IFNAME: + inetIF.Name = string(attr.Value[:len(attr.Value)-1]) + } + } + interfaces[ifinfo.Index] = inetIF + } + + for _, addr := range addrs { + if addr.Header.Type != syscall.RTM_NEWADDR { + continue + } + if len(addr.Data) < syscall.SizeofIfAddrmsg { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid data length (%d bytes, expected at least %d bytes)", len(addr.Data), syscall.SizeofIfAddrmsg) + } + var ifaddr syscall.IfAddrmsg + binary.Unmarshal(addr.Data[:syscall.SizeofIfAddrmsg], usermem.ByteOrder, &ifaddr) + inetAddr := inet.InterfaceAddr{ + Family: ifaddr.Family, + PrefixLen: ifaddr.Prefixlen, + Flags: ifaddr.Flags, + } + attrs, err := syscall.ParseNetlinkRouteAttr(&addr) + if err != nil { + return fmt.Errorf("RTM_GETADDR returned RTM_NEWADDR message with invalid rtattrs: %v", err) + } + for _, attr := range attrs { + switch attr.Attr.Type { + case syscall.IFA_ADDRESS: + inetAddr.Addr = attr.Value + } + } + interfaceAddrs[int32(ifaddr.Index)] = append(interfaceAddrs[int32(ifaddr.Index)], inetAddr) + } + + return nil +} + +func addHostInterfaces(s *Stack) error { + links, err := doNetlinkRouteRequest(syscall.RTM_GETLINK) + if err != nil { + return fmt.Errorf("RTM_GETLINK failed: %v", err) + } + + addrs, err := doNetlinkRouteRequest(syscall.RTM_GETADDR) + if err != nil { + return fmt.Errorf("RTM_GETADDR failed: %v", err) + } + + return ExtractHostInterfaces(links, addrs, s.interfaces, s.interfaceAddrs) +} + +func doNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) { + data, err := syscall.NetlinkRIB(req, syscall.AF_UNSPEC) + if err != nil { + return nil, err + } + return syscall.ParseNetlinkMessage(data) +} + +func readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) { + contents, err := ioutil.ReadFile(filename) + if err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to read %s: %v", filename, err) + } + ioseq := usermem.BytesIOSequence(contents) + fields := make([]int32, 3) + if n, err := usermem.CopyInt32StringsInVec(context.Background(), ioseq.IO, ioseq.Addrs, fields, ioseq.Opts); n != ioseq.NumBytes() || err != nil { + return inet.TCPBufferSize{}, fmt.Errorf("failed to parse %s (%q): got %v after %d/%d bytes", filename, contents, err, n, ioseq.NumBytes()) + } + return inet.TCPBufferSize{ + Min: int(fields[0]), + Default: int(fields[1]), + Max: int(fields[2]), + }, nil +} + +// Interfaces implements inet.Stack.Interfaces. +func (s *Stack) Interfaces() map[int32]inet.Interface { + return s.interfaces +} + +// InterfaceAddrs implements inet.Stack.InterfaceAddrs. +func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { + return s.interfaceAddrs +} + +// SupportsIPv6 implements inet.Stack.SupportsIPv6. +func (s *Stack) SupportsIPv6() bool { + return s.supportsIPv6 +} + +// TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. +func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { + return s.tcpRecvBufSize, nil +} + +// SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. +func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. +func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { + return s.tcpSendBufSize, nil +} + +// SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. +func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { + return syserror.EACCES +} + +// TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. +func (s *Stack) TCPSACKEnabled() (bool, error) { + return s.tcpSACKEnabled, nil +} + +// SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. +func (s *Stack) SetTCPSACKEnabled(enabled bool) error { + return syserror.EACCES +} |