Diffstat (limited to 'pkg/sentry/socket')
33 files changed, 2525 insertions, 448 deletions
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 611fa22c3..c40c6d673 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/tcpip", "//pkg/usermem", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 4d42d29cb..ca16d0381 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "control", - srcs = ["control.go"], + srcs = [ + "control.go", + "control_vfs2.go", + ], imports = [ "gvisor.dev/gvisor/pkg/sentry/fs", ], @@ -18,6 +21,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 4667373d2..8b439a078 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -41,6 +41,8 @@ type SCMCredentials interface { Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) } +// LINT.IfChange + // SCMRights represents a SCM_RIGHTS socket control message. type SCMRights interface { transport.RightsControlMessage @@ -142,6 +144,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flag return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } +// LINT.ThenChange(./control_vfs2.go) + // scmCredentials represents an SCM_CREDENTIALS socket control message. // // +stateify savable @@ -329,7 +333,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { } // PackTClass packs an IPV6_TCLASS socket control message. -func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte { +func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, @@ -537,11 +541,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con } if len(fds) > 0 { - rights, err := NewSCMRights(t, fds) - if err != nil { - return socket.ControlMessages{}, err + if kernel.VFS2Enabled { + rights, err := NewSCMRightsVFS2(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights + } else { + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights } - cmsgs.Unix.Rights = rights } return cmsgs, nil @@ -566,6 +578,8 @@ func MakeCreds(t *kernel.Task) SCMCredentials { return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } +// LINT.IfChange + // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ @@ -573,3 +587,5 @@ func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transpo Rights: rights, } } + +// LINT.ThenChange(./control_vfs2.go) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go new file mode 100644 index 000000000..fd08179be --- /dev/null +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
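The Parse hunk above now branches on kernel.VFS2Enabled when turning received FDs into SCM_RIGHTS. A minimal sketch of that dispatch as a standalone helper (hypothetical; the real code inlines the branch inside Parse):

```go
package control

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// newRights is a hypothetical helper condensing the branch added to Parse:
// with VFS2 enabled, SCM_RIGHTS FDs resolve to vfs.FileDescriptions
// (SCMRightsVFS2); otherwise they resolve to fs.Files (SCMRights). Both
// interfaces embed transport.RightsControlMessage.
func newRights(t *kernel.Task, fds []int32) (transport.RightsControlMessage, error) {
	if kernel.VFS2Enabled {
		return NewSCMRightsVFS2(t, fds)
	}
	return NewSCMRights(t, fds)
}
```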
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SCMRightsVFS2 represents a SCM_RIGHTS socket control message. +type SCMRightsVFS2 interface { + transport.RightsControlMessage + + // Files returns up to max RightsFiles. + // + // Returned files are consumed and ownership is transferred to the caller. + // Subsequent calls to Files will return the next files. + Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool) +} + +// RightsFiles represents a SCM_RIGHTS socket control message. A reference is +// maintained for each vfs.FileDescription and is release either when an FD is created or +// when the Release method is called. +type RightsFilesVFS2 []*vfs.FileDescription + +// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message +// representation using local sentry FDs. +func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { + files := make(RightsFilesVFS2, 0, len(fds)) + for _, fd := range fds { + file := t.GetFileVFS2(fd) + if file == nil { + files.Release() + return nil, syserror.EBADF + } + files = append(files, file) + } + return &files, nil +} + +// Files implements SCMRights.Files. +func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(*fs); n > l { + n = l + } else if n < l { + trunc = true + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { + nfs := append(RightsFilesVFS2(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements transport.RightsControlMessage.Release. +func (fs *RightsFilesVFS2) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDsVFS2 gets up to the specified maximum number of FDs. +func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds, trunc +} + +// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf. +func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. 
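SCMRightsVFS2.Files consumes entries and transfers their references to the caller. A hypothetical usage sketch of that contract, draining a rights message in small batches:

```go
package control

import (
	"gvisor.dev/gvisor/pkg/context"
)

// drainRights is a hypothetical example of the consumption contract above:
// each call to Files removes up to max entries and hands their references to
// the caller, who must DecRef (or install) every returned FileDescription.
func drainRights(ctx context.Context, rights SCMRightsVFS2) {
	for {
		files, _ := rights.Files(ctx, 4 /* arbitrary batch size */)
		if len(files) == 0 {
			return
		}
		for _, f := range files {
			f.DecRef() // release the reference we now own
		}
	}
}
```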
+ if maxFDs <= 0 { + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC + } + align := t.Arch().Width() + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) +} + +// NewVFS2 creates default control messages if needed. +func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages { + return transport.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 023bad156..e82d6cd1e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "socket_vfs2.go", "sockopt_impl.go", "stack.go", ], @@ -25,11 +26,14 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/hostfd", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 22f78d2e2..c11e82c10 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -49,6 +50,8 @@ const ( maxControlLen = 1024 ) +// LINT.IfChange + // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { @@ -59,23 +62,37 @@ type socketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout family int // Read-only. stype linux.SockType // Read-only. protocol int // Read-only. - fd int // must be O_NONBLOCK queue waiter.Queue + + // fd is the host socket fd. It must have O_NONBLOCK, so that operations + // will return EWOULDBLOCK instead of blocking on the host. This allows us to + // handle blocking behavior independently in the sentry. + fd int } var _ = socket.Socket(&socketOperations{}) func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { s := &socketOperations{ - family: family, - stype: stype, - protocol: protocol, - fd: fd, + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, } if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) @@ -86,28 +103,33 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc } // Release implements fs.FileOperations.Release. -func (s *socketOperations) Release() { +func (s *socketOpsCommon) Release() { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } // Readiness implements waiter.Waitable.Readiness. 
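The new socketOpsCommon comment explains that the host fd stays O_NONBLOCK so that blocking is handled in the sentry rather than on the host. A hedged sketch of that block-and-retry pattern, assuming op reports would-block conditions as syscall errnos (the real code inlines this per operation, e.g. in Accept):

```go
package hostinet

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/waiter"
)

// retryUntilReady is a hypothetical helper spelling out the pattern the
// comment above describes: a would-block result from the nonblocking host fd
// becomes a wait on the sentry-side waiter.Queue (kept up to date by
// fdnotifier), after which the host operation is retried.
func retryUntilReady(t *kernel.Task, q *waiter.Queue, mask waiter.EventMask, op func() error) error {
	e, ch := waiter.NewChannelEntry(nil)
	q.EventRegister(&e, mask)
	defer q.EventUnregister(&e)
	for {
		err := op()
		if err != syscall.EAGAIN && err != syscall.EWOULDBLOCK {
			return err
		}
		if err := t.Block(ch); err != nil {
			return err
		}
	}
}
```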
-func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(s.fd), mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.queue.EventRegister(e, mask) fdnotifier.UpdateFD(int32(s.fd)) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *socketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) fdnotifier.UpdateFD(int32(s.fd)) } +// Ioctl implements fs.FileOperations.Ioctl. +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, io, args) +} + // Read implements fs.FileOperations.Read. func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { @@ -155,7 +177,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } // Connect implements socket.Socket.Connect. -func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -195,7 +217,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr linux.SockAddr var peerAddrBuf []byte var peerAddrlen uint32 @@ -209,7 +231,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } // Conservatively ignore all flags specified by the application and add - // SOCK_NONBLOCK since socketOperations requires it. + // SOCK_NONBLOCK since socketOpsCommon requires it. 
fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) if blocking { var ch chan struct{} @@ -235,23 +257,41 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } - f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) - if err != nil { - syscall.Close(fd) - return 0, nil, 0, err - } - defer f.DecRef() + var ( + kfd int32 + kerr error + ) + if kernel.VFS2Enabled { + f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&syscall.SOCK_NONBLOCK)) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() - kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ - CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - }) - t.Kernel().RecordSocket(f) + kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocketVFS2(f) + } else { + f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocket(f) + } return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } // Bind implements socket.Socket.Bind. -func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -264,12 +304,12 @@ func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Listen implements socket.Socket.Listen. -func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(syscall.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. -func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { switch how { case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: return syserr.FromError(syscall.Shutdown(s.fd, how)) @@ -279,7 +319,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -328,7 +368,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. optlen := setSockOptLen(t, level, name) switch level { @@ -374,7 +414,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. 
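Accept above always passes SOCK_NONBLOCK|SOCK_CLOEXEC to the host accept4 and applies the application's flags only at the sentry level. A small hypothetical helper spelling out that split:

```go
package hostinet

import "syscall"

// acceptFlags is a hypothetical condensation of the flag handling in Accept:
// the host-side accept4 always gets SOCK_NONBLOCK|SOCK_CLOEXEC, while the
// application's SOCK_NONBLOCK and SOCK_CLOEXEC only drive the sentry-level
// file and FD-table flags.
func acceptFlags(appFlags int) (hostFlags int, nonblock, cloexec bool) {
	return syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC,
		appFlags&syscall.SOCK_NONBLOCK != 0,
		appFlags&syscall.SOCK_CLOEXEC != 0
}
```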
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary @@ -496,7 +536,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument @@ -515,7 +555,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] if uint64(src.NumBytes()) != srcs.NumBytes() { return 0, nil } - if srcs.IsEmpty() { + if srcs.IsEmpty() && len(controlBuf) == 0 { return 0, nil } @@ -585,7 +625,7 @@ func translateIOSyscallError(err error) error { } // State implements socket.Socket.State. -func (s *socketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { info := linux.TCPInfo{} buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo) if err != nil { @@ -607,7 +647,7 @@ func (s *socketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.stype, s.protocol } @@ -663,8 +703,11 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int return nil, nil, nil } +// LINT.ThenChange(./socket_vfs2.go) + func init() { for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) } } diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index cd67234d2..3f420c2ec 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" @@ -54,12 +53,11 @@ func writev(fd int, srcs []syscall.Iovec) (uint64, error) { return uint64(n), nil } -// Ioctl implements fs.FileOperations.Ioctl. 
-func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := uintptr(args[1].Int()); cmd { case syscall.TIOCINQ, syscall.TIOCOUTQ: var val int32 - if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { return 0, translateIOSyscallError(errno) } var buf [4]byte @@ -93,7 +91,7 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { } // GetSockName implements socket.Socket.GetSockName. -func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) @@ -104,7 +102,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, } // GetPeerName implements socket.Socket.GetPeerName. -func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go new file mode 100644 index 000000000..677743113 --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -0,0 +1,184 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +type socketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + // We store metadata for hostinet sockets internally. Technically, we should + // access metadata (e.g. through stat, chmod) on the host for correctness, + // but this is not very useful for inet socket fds, which do not belong to a + // concrete file anyway. 
+ vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&socketVFS2{}) + +func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + s := &socketVFS2{ + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, + } + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + return int64(n), err +} + +type socketProviderVFS2 struct { + family int +} + +// Socket implements socket.ProviderVFS2.Socket. +func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. 
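socketVFS2.Read and Write above go through a pooled hostfd.ReadWriterAt rather than raw readv/writev wrappers. A hypothetical condensation of the read path (offset -1 because sockets are not seekable):

```go
package hostinet

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/hostfd"
	"gvisor.dev/gvisor/pkg/usermem"
)

// readFromHostFD sketches socketVFS2.Read: borrow a pooled ReadWriterAt for
// the host fd, fill the destination IOSequence from it, and return the
// wrapper to the pool.
func readFromHostFD(ctx context.Context, fd int32, dst usermem.IOSequence) (int64, error) {
	reader := hostfd.GetReadWriterAt(fd, -1, 0 /* flags */)
	defer hostfd.PutReadWriterAt(reader)
	return dst.CopyOutFrom(ctx, reader)
}
```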
+ stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&syscall.SOCK_NONBLOCK)) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index c91ec7494..721094bbf 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -7,6 +7,8 @@ go_library( srcs = [ "extensions.go", "netfilter.go", + "owner_matcher.go", + "targets.go", "tcp_matcher.go", "udp_matcher.go", ], @@ -21,7 +23,6 @@ go_library( "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/header", - "//pkg/tcpip/iptables", "//pkg/tcpip/stack", "//pkg/usermem", ], diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index b4b244abf..0336a32d8 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,12 +37,12 @@ type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string - // marshal converts from an iptables.Matcher to an ABI struct. - marshal(matcher iptables.Matcher) []byte + // marshal converts from an stack.Matcher to an ABI struct. + marshal(matcher stack.Matcher) []byte // unmarshal converts from the ABI matcher struct to an - // iptables.Matcher. - unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) + // stack.Matcher. + unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) } // matchMakers maps the name of supported matchers to the matchMaker that @@ -58,7 +58,7 @@ func registerMatchMaker(mm matchMaker) { matchMakers[mm.name()] = mm } -func marshalMatcher(matcher iptables.Matcher) []byte { +func marshalMatcher(matcher stack.Matcher) []byte { matchMaker, ok := matchMakers[matcher.Name()] if !ok { panic(fmt.Sprintf("Unknown matcher of type %T.", matcher)) @@ -86,7 +86,7 @@ func marshalEntryMatch(name string, data []byte) []byte { return append(buf, make([]byte, size-len(buf))...) 
} -func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) { +func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { matchMaker, ok := matchMakers[match.Name.String()] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 3fc80e0de..66015e2bc 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "bytes" "errors" "fmt" @@ -26,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -35,6 +36,11 @@ import ( // shouldn't be reached - an error has occurred if we fall through to one. const errorTargetName = "ERROR" +// redirectTargetName is used to mark targets as redirect targets. Redirect +// targets should be reached for only NAT and Mangle tables. These targets will +// change the destination port/destination IP for packets. +const redirectTargetName = "REDIRECT" + // Metadata is used to verify that we are correctly serializing and // deserializing iptables into structs consumable by the iptables tool. We save // a metadata struct when the tables are written, and when they are read out we @@ -48,9 +54,23 @@ type metadata struct { Size uint32 } +// enableLogging controls whether to log the (de)serialization of netfilter +// structs between userspace and netstack. These logs are useful when +// developing iptables, but can pollute sentry logs otherwise. +const enableLogging = false + +// emptyFilter is for comparison with a rule's filters to determine whether it +// is also empty. It is immutable. +var emptyFilter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00", +} + // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { - if log.IsLogging(log.Debug) { + if enableLogging && log.IsLogging(log.Debug) { log.Debugf("netfilter: "+format, args...) } } @@ -123,39 +143,35 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen return entries, nil } -func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) { - ipt := stack.IPTables() - table, ok := ipt.Tables[tablename.String()] +func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) { + table, ok := stk.IPTables().GetTable(tablename.String()) if !ok { - return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename) + return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename) } return table, nil } -// FillDefaultIPTables sets stack's IPTables to the default tables and -// populates them with metadata. -func FillDefaultIPTables(stack *stack.Stack) { - ipt := iptables.DefaultTables() - - // In order to fill in the metadata, we have to translate ipt from its - // netstack format to Linux's giant-binary-blob format. 
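The matchMaker plumbing in extensions.go is what the TCP, UDP and new owner matchers plug into. A hypothetical skeleton of a matcher extension, written against the interfaces shown above (not part of this change):

```go
package netfilter

import (
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// exampleMarshaler/exampleMatcher sketch the extension pattern: a marshaler
// registers itself by name in init() and converts between the xt_entry_match
// ABI blob and a stack.Matcher.
type exampleMarshaler struct{}

func init() {
	registerMatchMaker(exampleMarshaler{})
}

// name implements matchMaker.name.
func (exampleMarshaler) name() string { return "example" }

// marshal implements matchMaker.marshal. Real matchers serialize an ABI
// struct here; this skeleton carries no match data.
func (exampleMarshaler) marshal(stack.Matcher) []byte {
	return marshalEntryMatch("example", nil)
}

// unmarshal implements matchMaker.unmarshal.
func (exampleMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) {
	return &exampleMatcher{}, nil
}

// exampleMatcher matches every packet and never hot-drops.
type exampleMatcher struct{}

// Name implements Matcher.Name.
func (*exampleMatcher) Name() string { return "example" }

// Match implements Matcher.Match.
func (*exampleMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
	return true, false
}
```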
- for name, table := range ipt.Tables { - _, metadata, err := convertNetstackToBinary(name, table) - if err != nil { - panic(fmt.Errorf("Unable to set default IP tables: %v", err)) +// FillIPTablesMetadata populates stack's IPTables with metadata. +func FillIPTablesMetadata(stk *stack.Stack) { + stk.IPTables().ModifyTables(func(tables map[string]stack.Table) { + // In order to fill in the metadata, we have to translate ipt from its + // netstack format to Linux's giant-binary-blob format. + for name, table := range tables { + _, metadata, err := convertNetstackToBinary(name, table) + if err != nil { + panic(fmt.Errorf("Unable to set default IP tables: %v", err)) + } + table.SetMetadata(metadata) + tables[name] = table } - table.SetMetadata(metadata) - ipt.Tables[name] = table - } - - stack.SetIPTables(ipt) + }) } // convertNetstackToBinary converts the iptables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a bit, reading some offsets, // jumping to those offsets, parsing again, etc. -func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) { +func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) { // Return values. var entries linux.KernelIPTGetEntries var meta metadata @@ -194,6 +210,21 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern TargetOffset: linux.SizeOfIPTEntry, }, } + copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) + copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src) + copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask) + copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP + } + if rule.Filter.SrcInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP + } + if rule.Filter.OutputInterfaceInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + } for _, matcher := range rule.Matchers { // Serialize the matcher and add it to the @@ -228,25 +259,29 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern return entries, meta, nil } -func marshalTarget(target iptables.Target) []byte { +func marshalTarget(target stack.Target) []byte { switch tg := target.(type) { - case iptables.AcceptTarget: - return marshalStandardTarget(iptables.RuleAccept) - case iptables.DropTarget: - return marshalStandardTarget(iptables.RuleDrop) - case iptables.ErrorTarget: + case stack.AcceptTarget: + return marshalStandardTarget(stack.RuleAccept) + case stack.DropTarget: + return marshalStandardTarget(stack.RuleDrop) + case stack.ErrorTarget: return marshalErrorTarget(errorTargetName) - case iptables.UserChainTarget: + case stack.UserChainTarget: return marshalErrorTarget(tg.Name) - case iptables.ReturnTarget: - return marshalStandardTarget(iptables.RuleReturn) + case stack.ReturnTarget: + return marshalStandardTarget(stack.RuleReturn) + case stack.RedirectTarget: + return marshalRedirectTarget(tg) + case JumpTarget: + return marshalJumpTarget(tg) default: panic(fmt.Errorf("unknown target of type %T", target)) } } -func marshalStandardTarget(verdict iptables.RuleVerdict) []byte { - nflog("convert to binary: marshalling standard target with size %d", 
linux.SizeOfXTStandardTarget) +func marshalStandardTarget(verdict stack.RuleVerdict) []byte { + nflog("convert to binary: marshalling standard target") // The target's name will be the empty string. target := linux.XTStandardTarget{ @@ -274,15 +309,55 @@ func marshalErrorTarget(errorName string) []byte { return binary.Marshal(ret, usermem.ByteOrder, target) } +func marshalRedirectTarget(rt stack.RedirectTarget) []byte { + // This is a redirect target named redirect + target := linux.XTRedirectTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTRedirectTarget, + }, + } + copy(target.Target.Name[:], redirectTargetName) + + ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) + target.NfRange.RangeSize = 1 + if rt.RangeProtoSpecified { + target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + } + // Convert port from little endian to big endian. + port := make([]byte, 2) + binary.LittleEndian.PutUint16(port, rt.MinPort) + target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port) + binary.LittleEndian.PutUint16(port, rt.MaxPort) + target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + +func marshalJumpTarget(jt JumpTarget) []byte { + nflog("convert to binary: marshalling jump target") + + // The target's name will be the empty string. + target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + // Verdict is overloaded by the ABI. When positive, it holds + // the jump offset from the start of the table. + Verdict: int32(jt.Offset), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + // translateFromStandardVerdict translates verdicts the same way as the iptables // tool. -func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 { +func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 { switch verdict { - case iptables.RuleAccept: + case stack.RuleAccept: return -linux.NF_ACCEPT - 1 - case iptables.RuleDrop: + case stack.RuleDrop: return -linux.NF_DROP - 1 - case iptables.RuleReturn: + case stack.RuleReturn: return linux.NF_RETURN default: // TODO(gvisor.dev/issue/170): Support Jump. @@ -291,18 +366,18 @@ func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 { } // translateToStandardTarget translates from the value in a -// linux.XTStandardTarget to an iptables.Verdict. -func translateToStandardTarget(val int32) (iptables.Target, error) { +// linux.XTStandardTarget to an stack.Verdict. +func translateToStandardTarget(val int32) (stack.Target, error) { // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: - return iptables.AcceptTarget{}, nil + return stack.AcceptTarget{}, nil case -linux.NF_DROP - 1: - return iptables.DropTarget{}, nil + return stack.DropTarget{}, nil case -linux.NF_QUEUE - 1: return nil, errors.New("unsupported iptables verdict QUEUE") case linux.NF_RETURN: - return iptables.ReturnTarget{}, nil + return stack.ReturnTarget{}, nil default: return nil, fmt.Errorf("unknown iptables verdict %d", val) } @@ -310,7 +385,7 @@ func translateToStandardTarget(val int32) (iptables.Target, error) { // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. 
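xt_standard_target overloads its Verdict field: negative values encode built-in verdicts, non-negative values hold a jump offset in bytes from the start of the table. A sketch condensing how translateToStandardTarget and parseTarget decode it:

```go
package netfilter

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// decodeStandardVerdict is a hypothetical condensation of the verdict
// decoding split between parseTarget and translateToStandardTarget.
func decodeStandardVerdict(verdict int32) (stack.Target, error) {
	if verdict >= 0 {
		// Non-negative verdicts are jump byte offsets, resolved to rule
		// indices later in SetEntries.
		return JumpTarget{Offset: uint32(verdict)}, nil
	}
	switch verdict {
	case -linux.NF_ACCEPT - 1:
		return stack.AcceptTarget{}, nil
	case -linux.NF_DROP - 1:
		return stack.DropTarget{}, nil
	case linux.NF_RETURN:
		return stack.ReturnTarget{}, nil
	default:
		return nil, fmt.Errorf("unknown or unsupported verdict %d", verdict)
	}
}
```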
-func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { +func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // Get the basic rules data (struct ipt_replace). if len(optVal) < linux.SizeOfIPTReplace { nflog("optVal has insufficient size for replace %d", len(optVal)) @@ -322,10 +397,12 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace) // TODO(gvisor.dev/issue/170): Support other tables. - var table iptables.Table + var table stack.Table switch replace.Name.String() { - case iptables.TablenameFilter: - table = iptables.EmptyFilterTable() + case stack.TablenameFilter: + table = stack.EmptyFilterTable() + case stack.TablenameNat: + table = stack.EmptyNatTable() default: nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) return syserr.ErrInvalidArgument @@ -335,7 +412,8 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // Convert input into a list of rules and their offsets. var offset uint32 - var offsets []uint32 + // offsets maps rule byte offsets to their position in table.Rules. + offsets := map[uint32]int{} for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { nflog("set entries: processing entry at offset %d", offset) @@ -384,23 +462,24 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) return syserr.ErrInvalidArgument } - target, err := parseTarget(optVal[:targetSize]) + target, err := parseTarget(filter, optVal[:targetSize]) if err != nil { nflog("failed to parse target: %v", err) return syserr.ErrInvalidArgument } optVal = optVal[targetSize:] - table.Rules = append(table.Rules, iptables.Rule{ + table.Rules = append(table.Rules, stack.Rule{ Filter: filter, Target: target, Matchers: matchers, }) - offsets = append(offsets, offset) + offsets[offset] = int(entryIdx) offset += uint32(entry.NextOffset) if initialOptValLen-len(optVal) != int(entry.NextOffset) { nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return syserr.ErrInvalidArgument } } @@ -409,23 +488,23 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { for hook, _ := range replace.HookEntry { if table.ValidHooks()&(1<<hook) != 0 { hk := hookFromLinux(hook) - for ruleIdx, offset := range offsets { + for offset, ruleIdx := range offsets { if offset == replace.HookEntry[hook] { table.BuiltinChains[hk] = ruleIdx } if offset == replace.Underflow[hook] { if !validUnderflow(table.Rules[ruleIdx]) { - nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP.") + nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) return syserr.ErrInvalidArgument } table.Underflows[hk] = ruleIdx } } - if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset { + if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset { nflog("hook %v is unset.", hk) return syserr.ErrInvalidArgument } - if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset { + if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset { nflog("underflow %v is unset.", hk) return syserr.ErrInvalidArgument } @@ -434,7 +513,7 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // Add the user chains. 
for ruleIdx, rule := range table.Rules { - target, ok := rule.Target.(iptables.UserChainTarget) + target, ok := rule.Target.(stack.UserChainTarget) if !ok { continue } @@ -444,22 +523,41 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // - There's some other rule after it. // - There are no matchers. if ruleIdx == len(table.Rules)-1 { - nflog("user chain must have a rule or default policy.") + nflog("user chain must have a rule or default policy") return syserr.ErrInvalidArgument } if len(table.Rules[ruleIdx].Matchers) != 0 { - nflog("user chain's first node must have no matcheres.") + nflog("user chain's first node must have no matchers") return syserr.ErrInvalidArgument } table.UserChains[target.Name] = ruleIdx + 1 } + // Set each jump to point to the appropriate rule. Right now they hold byte + // offsets. + for ruleIdx, rule := range table.Rules { + jump, ok := rule.Target.(JumpTarget) + if !ok { + continue + } + + // Find the rule corresponding to the jump rule offset. + jumpTo, ok := offsets[jump.Offset] + if !ok { + nflog("failed to find a rule to jump to") + return syserr.ErrInvalidArgument + } + jump.RuleNum = jumpTo + rule.Target = jump + table.Rules[ruleIdx] = rule + } + // TODO(gvisor.dev/issue/170): Support other chains. - // Since we only support modifying the INPUT chain right now, make sure - // all other chains point to ACCEPT rules. + // Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now, + // make sure all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { - if hook != iptables.Input { - if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok { + if hook == stack.Forward || hook == stack.Postrouting { + if !isUnconditionalAccept(table.Rules[ruleIdx]) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } @@ -471,24 +569,22 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. - ipt := stack.IPTables() table.SetMetadata(metadata{ HookEntry: replace.HookEntry, Underflow: replace.Underflow, NumEntries: replace.NumEntries, Size: replace.Size, }) - ipt.Tables[replace.Name.String()] = table - stack.SetIPTables(ipt) + stk.IPTables().ReplaceTable(replace.Name.String(), table) return nil } // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. -func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) { +func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { nflog("set entries: parsing matchers of size %d", len(optVal)) - var matchers []iptables.Matcher + var matchers []stack.Matcher for len(optVal) > 0 { nflog("set entries: optVal has len %d", len(optVal)) @@ -530,7 +626,7 @@ func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Ma // parseTarget parses a target from optVal. optVal should contain only the // target. 
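Jump targets are parsed carrying a byte offset and only resolved to a rule index in a second pass, using the offsets map built while reading entries (rules have variable sizes, so offset 304 means "the rule that starts at byte 304", not rule number 304). A small sketch of that resolution step:

```go
package netfilter

import "fmt"

// resolveJumpSketch illustrates the second pass over table.Rules: each
// JumpTarget's Offset, taken from the ABI, is mapped to a rule index via the
// offsets map built during entry parsing.
func resolveJumpSketch(offsets map[uint32]int, jump JumpTarget) (JumpTarget, error) {
	ruleIdx, ok := offsets[jump.Offset]
	if !ok {
		return JumpTarget{}, fmt.Errorf("no rule begins at offset %d", jump.Offset)
	}
	jump.RuleNum = ruleIdx
	return jump, nil
}
```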
-func parseTarget(optVal []byte) (iptables.Target, error) { +func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) { nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) @@ -548,7 +644,12 @@ func parseTarget(optVal []byte) (iptables.Target, error) { buf = optVal[:linux.SizeOfXTStandardTarget] binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) - return translateToStandardTarget(standardTarget.Verdict) + if standardTarget.Verdict < 0 { + // A Verdict < 0 indicates a non-jump verdict. + return translateToStandardTarget(standardTarget.Verdict) + } + // A verdict >= 0 indicates a jump. + return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil case errorTargetName: // Error target. @@ -569,67 +670,155 @@ func parseTarget(optVal []byte) (iptables.Target, error) { switch name := errorTarget.Name.String(); name { case errorTargetName: nflog("set entries: error target") - return iptables.ErrorTarget{}, nil + return stack.ErrorTarget{}, nil default: // User defined chain. nflog("set entries: user-defined target %q", name) - return iptables.UserChainTarget{Name: name}, nil + return stack.UserChainTarget{Name: name}, nil + } + + case redirectTargetName: + // Redirect target. + if len(optVal) < linux.SizeOfXTRedirectTarget { + return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal)) + } + + if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + var redirectTarget linux.XTRedirectTarget + buf = optVal[:linux.SizeOfXTRedirectTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) + + // Copy linux.XTRedirectTarget to stack.RedirectTarget. + var target stack.RedirectTarget + nfRange := redirectTarget.NfRange + + // RangeSize should be 1. + if nfRange.RangeSize != 1 { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + // TODO(gvisor.dev/issue/170): Check if the flags are valid. + // Also check if we need to map ports or IP. + // For now, redirect target only supports destination port change. + // Port range and IP range are not supported yet. + if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") } + target.RangeProtoSpecified = true + + target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) + target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:]) + + // TODO(gvisor.dev/issue/170): Port range is not supported yet. + if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { + return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") + } + + // Convert port from big endian to little endian. + port := make([]byte, 2) + binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort) + target.MinPort = binary.LittleEndian.Uint16(port) + + binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort) + target.MaxPort = binary.LittleEndian.Uint16(port) + return target, nil } // Unknown target. 
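The redirect target's ports travel in network (big-endian) byte order inside the nf_nat range, while RedirectTarget stores them little-endian, so both marshalling and parsing byte-swap them. A sketch of that conversion, using the standard library's encoding/binary for clarity rather than pkg/binary:

```go
package netfilter

import "encoding/binary"

// byteSwapPort mirrors the conversion done in marshalRedirectTarget and the
// redirect-target branch of parseTarget: write the port with one byte order
// and read it back with the other.
func byteSwapPort(port uint16) uint16 {
	var b [2]byte
	binary.BigEndian.PutUint16(b[:], port)
	return binary.LittleEndian.Uint16(b[:])
}
```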
return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) } -func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) { +func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { if containsUnsupportedFields(iptip) { - return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) } - return iptables.IPHeaderFilter{ - Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, }, nil } func containsUnsupportedFields(iptip linux.IPTIP) bool { - // Currently we check that everything except protocol is zeroed. - var emptyInetAddr = linux.InetAddr{} + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. var emptyInterface = [linux.IFNAMSIZ]byte{} - return iptip.Dst != emptyInetAddr || - iptip.Src != emptyInetAddr || - iptip.SrcMask != emptyInetAddr || - iptip.DstMask != emptyInetAddr || - iptip.InputInterface != emptyInterface || - iptip.OutputInterface != emptyInterface || + // Disable any supported inverse flags. 
+ inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || iptip.InputInterfaceMask != emptyInterface || - iptip.OutputInterfaceMask != emptyInterface || iptip.Flags != 0 || - iptip.InverseFlags != 0 + iptip.InverseFlags&^inverseMask != 0 } -func validUnderflow(rule iptables.Rule) bool { +func validUnderflow(rule stack.Rule) bool { if len(rule.Matchers) != 0 { return false } + if rule.Filter != emptyFilter { + return false + } switch rule.Target.(type) { - case iptables.AcceptTarget, iptables.DropTarget: + case stack.AcceptTarget, stack.DropTarget: return true default: return false } } -func hookFromLinux(hook int) iptables.Hook { +func isUnconditionalAccept(rule stack.Rule) bool { + if !validUnderflow(rule) { + return false + } + _, ok := rule.Target.(stack.AcceptTarget) + return ok +} + +func hookFromLinux(hook int) stack.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: - return iptables.Prerouting + return stack.Prerouting case linux.NF_INET_LOCAL_IN: - return iptables.Input + return stack.Input case linux.NF_INET_FORWARD: - return iptables.Forward + return stack.Forward case linux.NF_INET_LOCAL_OUT: - return iptables.Output + return stack.Output case linux.NF_INET_POST_ROUTING: - return iptables.Postrouting + return stack.Postrouting } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go new file mode 100644 index 000000000..1b4e0ad79 --- /dev/null +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameOwner = "owner" + +func init() { + registerMatchMaker(ownerMarshaler{}) +} + +// ownerMarshaler implements matchMaker for owner matching. +type ownerMarshaler struct{} + +// name implements matchMaker.name. +func (ownerMarshaler) name() string { + return matcherNameOwner +} + +// marshal implements matchMaker.marshal. +func (ownerMarshaler) marshal(mr stack.Matcher) []byte { + matcher := mr.(*OwnerMatcher) + iptOwnerInfo := linux.IPTOwnerInfo{ + UID: matcher.uid, + GID: matcher.gid, + } + + // Support for UID and GID match. + if matcher.matchUID { + iptOwnerInfo.Match = linux.XT_OWNER_UID + if matcher.invertUID { + iptOwnerInfo.Invert = linux.XT_OWNER_UID + } + } + if matcher.matchGID { + iptOwnerInfo.Match |= linux.XT_OWNER_GID + if matcher.invertGID { + iptOwnerInfo.Invert |= linux.XT_OWNER_GID + } + } + + buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo) + return marshalEntryMatch(matcherNameOwner, binary.Marshal(buf, usermem.ByteOrder, iptOwnerInfo)) +} + +// unmarshal implements matchMaker.unmarshal. 
+func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { + if len(buf) < linux.SizeOfIPTOwnerInfo { + return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. + var matchData linux.IPTOwnerInfo + binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) + + var owner OwnerMatcher + owner.uid = matchData.UID + owner.gid = matchData.GID + + // Check flags. + if matchData.Match&linux.XT_OWNER_UID != 0 { + owner.matchUID = true + if matchData.Invert&linux.XT_OWNER_UID != 0 { + owner.invertUID = true + } + } + if matchData.Match&linux.XT_OWNER_GID != 0 { + owner.matchGID = true + if matchData.Invert&linux.XT_OWNER_GID != 0 { + owner.invertGID = true + } + } + + return &owner, nil +} + +type OwnerMatcher struct { + uid uint32 + gid uint32 + matchUID bool + matchGID bool + invertUID bool + invertGID bool +} + +// Name implements Matcher.Name. +func (*OwnerMatcher) Name() string { + return matcherNameOwner +} + +// Match implements Matcher.Match. +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { + // Support only for OUTPUT chain. + // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. + if hook != stack.Output { + return false, true + } + + // If the packet owner is not set, drop the packet. + if pkt.Owner == nil { + return false, true + } + + var matches bool + // Check for UID match. + if om.matchUID { + if pkt.Owner.UID() == om.uid { + matches = true + } + if matches == om.invertUID { + return false, false + } + } + + // Check for GID match. + if om.matchGID { + matches = false + if pkt.Owner.GID() == om.gid { + matches = true + } + if matches == om.invertGID { + return false, false + } + } + + return true, false +} diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go new file mode 100644 index 000000000..84abe8d29 --- /dev/null +++ b/pkg/sentry/socket/netfilter/targets.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// JumpTarget implements stack.Target. +type JumpTarget struct { + // Offset is the byte offset of the rule to jump to. It is used for + // marshaling and unmarshaling. + Offset uint32 + + // RuleNum is the rule to jump to. + RuleNum int +} + +// Action implements stack.Target.Action. 
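The owner matcher applies the same check-with-invert idiom to both UID and GID. A sketch of the pass condition it encodes:

```go
package netfilter

// matchesWithInvert captures the check pattern used by OwnerMatcher.Match for
// each sub-check: the packet passes when "value equals the configured id"
// differs from the corresponding invert flag.
func matchesWithInvert(got, want uint32, invert bool) bool {
	return (got == want) != invert
}
```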
+func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { + return stack.RuleJump, jt.RuleNum +} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index f9945e214..4f98ee2d5 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -19,9 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" - "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,7 +39,7 @@ func (tcpMarshaler) name() string { } // marshal implements matchMaker.marshal. -func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { +func (tcpMarshaler) marshal(mr stack.Matcher) []byte { matcher := mr.(*TCPMatcher) xttcp := linux.XTTCP{ SourcePortStart: matcher.sourcePortStart, @@ -53,7 +52,7 @@ func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { +func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTTCP { return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) } @@ -97,7 +96,7 @@ func (*TCPMatcher) Name() string { } // Match implements Matcher.Match. -func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { netHeader := header.IPv4(pkt.NetworkHeader) if netHeader.TransportProtocol() != header.TCPProtocolNumber { @@ -112,22 +111,10 @@ func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac return false, false } - // Now we need the transport header. However, this may not have been set - // yet. - // TODO(gvisor.dev/issue/170): Parsing the transport header should - // ultimately be moved into the iptables.Check codepath as matchers are - // added. - var tcpHeader header.TCP - if pkt.TransportHeader != nil { - tcpHeader = header.TCP(pkt.TransportHeader) - } else { - // The TCP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.TCPMinimumSize { - // There's no valid TCP header here, so we hotdrop the - // packet. - return false, true - } - tcpHeader = header.TCP(pkt.Data.First()) + tcpHeader := header.TCP(pkt.TransportHeader) + if len(tcpHeader) < header.TCPMinimumSize { + // There's no valid TCP header here, so we drop the packet immediately. + return false, true } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index 86aa11696..3f20fc891 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -19,9 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" - "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -40,7 +39,7 @@ func (udpMarshaler) name() string { } // marshal implements matchMaker.marshal. 
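The matchers above return a (matched, hotdrop) pair, where a too-short transport header is dropped outright rather than being passed on to later rules. A self-contained sketch of that convention, using invented names rather than the real matcher fields:

package main

import "fmt"

// checkPorts mimics the shape of a Matcher.Match result: (matched, hotdrop).
// A header shorter than the minimum is dropped immediately; otherwise the
// packet matches only if the port falls inside the rule's [start, end] range.
func checkPorts(headerLen, minHeaderLen int, port, start, end uint16) (matched, hotdrop bool) {
	if headerLen < minHeaderLen {
		// Malformed packet: request an immediate drop.
		return false, true
	}
	return port >= start && port <= end, false
}

func main() {
	fmt.Println(checkPorts(20, 20, 8080, 8000, 9000)) // true false
	fmt.Println(checkPorts(20, 20, 22, 8000, 9000))   // false false
	fmt.Println(checkPorts(4, 20, 8080, 8000, 9000))  // false true
}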
-func (udpMarshaler) marshal(mr iptables.Matcher) []byte { +func (udpMarshaler) marshal(mr stack.Matcher) []byte { matcher := mr.(*UDPMatcher) xtudp := linux.XTUDP{ SourcePortStart: matcher.sourcePortStart, @@ -53,7 +52,7 @@ func (udpMarshaler) marshal(mr iptables.Matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { +func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTUDP { return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) } @@ -94,11 +93,11 @@ func (*UDPMatcher) Name() string { } // Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { netHeader := header.IPv4(pkt.NetworkHeader) // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the iptables.Check codepath as matchers are added. + // into the stack.Check codepath as matchers are added. if netHeader.TransportProtocol() != header.UDPProtocolNumber { return false, false } @@ -111,22 +110,10 @@ func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfac return false, false } - // Now we need the transport header. However, this may not have been set - // yet. - // TODO(gvisor.dev/issue/170): Parsing the transport header should - // ultimately be moved into the iptables.Check codepath as matchers are - // added. - var udpHeader header.UDP - if pkt.TransportHeader != nil { - udpHeader = header.UDP(pkt.TransportHeader) - } else { - // The UDP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.UDPMinimumSize { - // There's no valid UDP header here, so we hotdrop the - // packet. - return false, true - } - udpHeader = header.UDP(pkt.Data.First()) + udpHeader := header.UDP(pkt.TransportHeader) + if len(udpHeader) < header.UDPMinimumSize { + // There's no valid UDP header here, so we drop the packet immediately. + return false, true } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 1911cd9b8..7212d8644 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "message.go", "provider.go", + "provider_vfs2.go", "socket.go", + "socket_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -18,6 +20,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -25,6 +28,7 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index b0dc70e5c..0d45e5053 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -67,6 +67,8 @@ func RegisterProvider(protocol int, provider Provider) { protocols[protocol] = provider } +// LINT.IfChange + // socketProvider implements socket.Provider. 
type socketProvider struct { } @@ -105,7 +107,10 @@ func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.Fi return nil, nil, syserr.ErrNotSupported } +// LINT.ThenChange(./provider_vfs2.go) + // init registers the socket provider. func init() { socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) + socket.RegisterProviderVFS2(linux.AF_NETLINK, &socketProviderVFS2{}) } diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go new file mode 100644 index 000000000..bb205be0d --- /dev/null +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -0,0 +1,69 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" +) + +// socketProviderVFS2 implements socket.Provider. +type socketProviderVFS2 struct { +} + +// Socket implements socket.Provider.Socket. +func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewVFS2(t, stype, p) + if err != nil { + return nil, err + } + + vfsfd := &s.vfsfd + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2ca02567d..81f34c5a2 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -58,6 +58,8 @@ var errNoFilter = syserr.New("no filter attached", linux.ENOENT) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() +// LINT.IfChange + // Socket is the base socket type for netlink sockets. 
// // This implementation only supports userspace sending and receiving messages @@ -74,6 +76,14 @@ type Socket struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout // ports provides netlink port allocation. @@ -140,17 +150,19 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } return &Socket{ - ports: t.Kernel().NetlinkPorts(), - protocol: protocol, - skType: skType, - ep: ep, - connection: connection, - sendBufferSize: defaultSendBufferSize, + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, }, nil } // Release implements fs.FileOperations.Release. -func (s *Socket) Release() { +func (s *socketOpsCommon) Release() { s.connection.Release() s.ep.Close() @@ -160,7 +172,7 @@ func (s *Socket) Release() { } // Readiness implements waiter.Waitable.Readiness. -func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { // ep holds messages to be read and thus handles EventIn readiness. ready := s.ep.Readiness(mask) @@ -174,18 +186,18 @@ func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { } // EventRegister implements waiter.Waitable.EventRegister. -func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) // Writable readiness never changes, so no registration is needed. } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *Socket) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // Passcred implements transport.Credentialer.Passcred. -func (s *Socket) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { s.mu.Lock() passcred := s.passcred s.mu.Unlock() @@ -193,7 +205,7 @@ func (s *Socket) Passcred() bool { } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *Socket) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { // This socket is connected to the kernel, which doesn't need creds. // // This is arbitrary, as ConnectedPasscred on this type has no callers. @@ -227,7 +239,7 @@ func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { // port of 0 defaults to the ThreadGroup ID. // // Preconditions: mu is held. -func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { +func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error { if s.bound { // Re-binding is only allowed if the port doesn't change. if port != s.portID { @@ -251,7 +263,7 @@ func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { } // Bind implements socket.Socket.Bind. -func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -269,7 +281,7 @@ func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Connect implements socket.Socket.Connect. 
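The refactoring pattern used here (and again for netstack below) moves shared behavior into an embedded socketOpsCommon so that the VFS1 and VFS2 wrapper types reuse a single implementation. A minimal sketch of that embedding pattern, with stand-in type names rather than the real gVisor types:

package main

import "fmt"

// opsCommon plays the role of socketOpsCommon: it owns the state and methods
// shared by both wrappers.
type opsCommon struct {
	proto int
}

func (c *opsCommon) Protocol() int { return c.proto }

// socketVFS1 and socketVFS2 stand in for the VFS1 and VFS2 socket types; each
// embeds opsCommon and adds only filesystem-specific plumbing.
type socketVFS1 struct{ opsCommon }
type socketVFS2 struct{ opsCommon }

func main() {
	s1 := socketVFS1{opsCommon: opsCommon{proto: 0}} // arbitrary protocol value
	s2 := socketVFS2{opsCommon: opsCommon{proto: 0}}
	fmt.Println(s1.Protocol(), s2.Protocol()) // both resolve to opsCommon.Protocol
}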
-func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -300,25 +312,25 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } // Listen implements socket.Socket.Listen. -func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // Netlink sockets never support listen. return syserr.ErrNotSupported } // Shutdown implements socket.Socket.Shutdown. -func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { // Netlink sockets never support shutdown. return syserr.ErrNotSupported } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { @@ -369,7 +381,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem. } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: switch name { @@ -466,7 +478,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } // GetSockName implements socket.Socket.GetSockName. -func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { s.mu.Lock() defer s.mu.Unlock() @@ -478,7 +490,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // GetPeerName implements socket.Socket.GetPeerName. -func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { sa := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, // TODO(b/68878065): Support non-kernel peers. For now the peer @@ -489,7 +501,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // RecvMsg implements socket.Socket.RecvMsg. 
-func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { from := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -590,7 +602,7 @@ func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) var kernelCreds = &kernelSCM{} // sendResponse sends the response messages in ms back to userspace. -func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { +func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { // Linux combines multiple netlink messages into a single datagram. bufs := make([][]byte, 0, len(ms.Messages)) for _, m := range ms.Messages { @@ -666,7 +678,7 @@ func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { // processMessages handles each message in buf, passing it to the protocol // handler for final handling. -func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { +func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { msg, rest, ok := ParseMessage(buf) if !ok { @@ -698,7 +710,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } // sendMsg is the core of message send, used for SendMsg and Write. -func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { @@ -745,7 +757,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. -func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } @@ -756,11 +768,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, } // State implements socket.Socket.State. -func (s *Socket) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. -func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return linux.AF_NETLINK, s.skType, s.protocol.Protocol() } + +// LINT.ThenChange(./socket_vfs2.go) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go new file mode 100644 index 000000000..b854bf990 --- /dev/null +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -0,0 +1,138 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 is the base VFS2 socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ socket.SocketVFS2 = (*SocketVFS2)(nil) +var _ transport.Credentialer = (*SocketVFS2)(nil) + +// NewVFS2 creates a new SocketVFS2. +func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless(t) + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. + connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) + if err != nil { + ep.Close() + return nil, err + } + + return &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, + }, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. 
+func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index ab01cb4fa..8f0f5466e 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "device.go", "netstack.go", + "netstack_vfs2.go", "provider.go", + "provider_vfs2.go", "save_restore.go", "stack.go", ], @@ -16,6 +18,7 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/amutex", "//pkg/binary", "//pkg/context", "//pkg/log", @@ -25,6 +28,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -32,13 +36,13 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", - "//pkg/tcpip/iptables", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", @@ -46,5 +50,6 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 9757fbfba..e1e0c5931 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -29,10 +29,13 @@ import ( "io" "math" "reflect" + "sync/atomic" "syscall" "time" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" @@ -62,7 +65,13 @@ import ( func mustCreateMetric(name, description string) *tcpip.StatCounter { var cm tcpip.StatCounter - metric.MustRegisterCustomUint64Metric(name, false /* sync */, description, cm.Value) + metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value) + return &cm +} + +func mustCreateGauge(name, description string) *tcpip.StatCounter { + var cm tcpip.StatCounter + metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value) return &cm } @@ -150,10 +159,10 @@ var Metrics = tcpip.Stats{ TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), PassiveConnectionOpenings: 
mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), - CurrentEstablished: mustCreateMetric("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), - CurrentConnected: mustCreateMetric("/netstack/tcp/current_open", "Number of connections that are in connected state."), + CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), + CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), - EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "number of times established TCP connections made a transition to CLOSED state."), + EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), @@ -245,6 +254,8 @@ type commonEndpoint interface { GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) } +// LINT.IfChange + // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. // @@ -256,6 +267,14 @@ type SocketOperations struct { fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout *waiter.Queue @@ -264,6 +283,12 @@ type SocketOperations struct { skType linux.SockType protocol int + // readViewHasData is 1 iff readView has data to be read, 0 otherwise. + // Must be accessed using atomic operations. It must only be written + // with readMu held but can be read without holding readMu. The latter + // is required to avoid deadlocks in epoll Readiness checks. + readViewHasData uint32 + // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` // readView contains the remaining payload from the last packet. @@ -293,7 +318,7 @@ type SocketOperations struct { // New creates a new endpoint socket. 
func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { if skType == linux.SOCK_STREAM { - if err := endpoint.SetSockOptInt(tcpip.DelayOption, 1); err != nil { + if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { return nil, syserr.TranslateNetstackError(err) } } @@ -301,11 +326,13 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue dirent := socket.NewDirent(t, netstackDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ - Queue: queue, - family: family, - Endpoint: endpoint, - skType: skType, - protocol: protocol, + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, }), nil } @@ -404,33 +431,36 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { } } -func (s *SocketOperations) isPacketBased() bool { +func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } // fetchReadView updates the readView field of the socket if it's currently // empty. It assumes that the socket is locked. -func (s *SocketOperations) fetchReadView() *syserr.Error { +// +// Precondition: s.readMu must be held. +func (s *socketOpsCommon) fetchReadView() *syserr.Error { if len(s.readView) > 0 { return nil } - s.readView = nil s.sender = tcpip.FullAddress{} v, cms, err := s.Endpoint.Read(&s.sender) if err != nil { + atomic.StoreUint32(&s.readViewHasData, 0) return syserr.TranslateNetstackError(err) } s.readView = v s.readCM = cms + atomic.StoreUint32(&s.readViewHasData, 1) return nil } // Release implements fs.FileOperations.Release. -func (s *SocketOperations) Release() { +func (s *socketOpsCommon) Release() { s.Endpoint.Close() } @@ -525,11 +555,9 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } if resCh != nil { - t := ctx.(*kernel.Task) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) } @@ -598,11 +626,9 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } if resCh != nil { - t := ctx.(*kernel.Task) - if err := t.Block(resCh); err != nil { - return 0, syserr.FromError(err).ToError() + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err } - n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{ Atomic: true, // See above. }) @@ -617,23 +643,21 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } // Readiness returns a mask of ready events for socket s. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { r := s.Endpoint.Readiness(mask) // Check our cached value iff the caller asked for readability and the // endpoint itself is currently not readable. 
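The readViewHasData flag introduced above is written only while readMu is held, but is read with a plain atomic load, which is what lets the Readiness check that follows avoid taking the lock. A self-contained sketch of this pattern, with illustrative names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// recvBuffer mimics the readView/readViewHasData arrangement: hasData is only
// written with mu held, but may be read atomically without taking mu.
type recvBuffer struct {
	mu      sync.Mutex
	data    []byte
	hasData uint32 // 1 iff data is non-empty.
}

func (b *recvBuffer) fill(p []byte) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.data = append(b.data, p...)
	atomic.StoreUint32(&b.hasData, 1)
}

func (b *recvBuffer) drain() {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.data = nil
	atomic.StoreUint32(&b.hasData, 0)
}

// readable is safe to call without holding mu, e.g. from an epoll readiness
// check, so readiness never contends with an in-progress read.
func (b *recvBuffer) readable() bool {
	return atomic.LoadUint32(&b.hasData) == 1
}

func main() {
	var b recvBuffer
	b.fill([]byte("payload"))
	fmt.Println(b.readable()) // true
	b.drain()
	fmt.Println(b.readable()) // false
}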
if (mask & ^r & waiter.EventIn) != 0 { - s.readMu.Lock() - if len(s.readView) > 0 { + if atomic.LoadUint32(&s.readViewHasData) == 1 { r |= waiter.EventIn } - s.readMu.Unlock() } return r } -func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error { +func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { if family == uint16(s.family) { return nil } @@ -655,8 +679,8 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error // This is a hack to work around the fact that both IPv4 and IPv6 ANY are // represented by the empty string. // -// TODO(gvisor.dev/issues/1556): remove this function. -func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { +// TODO(gvisor.dev/issue/1556): remove this function. +func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" } @@ -665,7 +689,7 @@ func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpi // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { addr, family, err := AddressAndFamily(sockaddr) if err != nil { return err @@ -696,6 +720,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo defer s.EventUnregister(&e) if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting { + if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { + // TCP unlike UDP returns EADDRNOTAVAIL when it can't + // find an available local ephemeral port. + if err == tcpip.ErrNoPortAvailable { + return syserr.ErrAddressNotAvailable + } + } + return syserr.TranslateNetstackError(err) } @@ -711,15 +743,45 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, family, err := AddressAndFamily(sockaddr) - if err != nil { - return err +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + if len(sockaddr) < 2 { + return syserr.ErrInvalidArgument } - if err := s.checkFamily(family, true /* exact */); err != nil { - return err + + family := usermem.ByteOrder.Uint16(sockaddr) + var addr tcpip.FullAddress + + // Bind for AF_PACKET requires only family, protocol and ifindex. + // In function AddressAndFamily, we check the address length which is + // not needed for AF_PACKET bind. 
+ if family == linux.AF_PACKET { + var a linux.SockAddrLink + if len(sockaddr) < sockAddrLinkSize { + return syserr.ErrInvalidArgument + } + binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a) + + if a.Protocol != uint16(s.protocol) { + return syserr.ErrInvalidArgument + } + + addr = tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + } + } else { + var err *syserr.Error + addr, family, err = AddressAndFamily(sockaddr) + if err != nil { + return err + } + + if err = s.checkFamily(family, true /* exact */); err != nil { + return err + } + + addr = s.mapFamily(addr, family) } - addr = s.mapFamily(addr, family) // Issue the bind request to the endpoint. return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) @@ -727,13 +789,13 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { +func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -819,7 +881,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err @@ -902,7 +964,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -927,8 +989,15 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, return nil, syserr.ErrProtocolNotAvailable } +func boolToInt32(v bool) int32 { + if v { + return 1 + } + return 0 +} + // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. 
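boolToInt32 above supports the wider migration in this change from typed GetSockOpt calls to GetSockOptBool/GetSockOptInt. A hedged sketch of how a boolean option is converted to the 4-byte integer that getsockopt(2) returns; the endpoint type below is a stand-in, not the real commonEndpoint:

package main

import (
	"encoding/binary"
	"fmt"
)

// stubEndpoint stands in for an endpoint exposing GetSockOptBool.
type stubEndpoint struct{ keepalive bool }

func (e *stubEndpoint) GetSockOptBool(opt string) (bool, error) {
	return e.keepalive, nil
}

func boolToInt32(v bool) int32 {
	if v {
		return 1
	}
	return 0
}

func main() {
	ep := &stubEndpoint{keepalive: true}
	v, err := ep.GetSockOptBool("keepalive")
	if err != nil {
		panic(err)
	}
	// getsockopt(2) reports boolean options as a 4-byte integer.
	out := make([]byte, 4)
	binary.LittleEndian.PutUint32(out, uint32(boolToInt32(v)))
	fmt.Println(out) // [1 0 0 0]
}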
switch name { case linux.SO_ERROR: @@ -960,12 +1029,11 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrInvalidArgument } - var v tcpip.PasscredOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.PasscredOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { @@ -1004,24 +1072,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrInvalidArgument } - var v tcpip.ReuseAddressOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.ReuseAddressOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var v tcpip.ReusePortOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.ReusePortOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.SO_BINDTODEVICE: var v tcpip.BindToDeviceOption @@ -1051,24 +1117,22 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family return nil, syserr.ErrInvalidArgument } - var v tcpip.BroadcastOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.BroadcastOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var v tcpip.KeepaliveEnabledOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.KeepaliveEnabledOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { @@ -1118,47 +1182,41 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.ErrInvalidArgument } - v, err := ep.GetSockOptInt(tcpip.DelayOption) + v, err := ep.GetSockOptBool(tcpip.DelayOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } - - if v == 0 { - return int32(1), nil - } - return int32(0), nil + return boolToInt32(!v), nil case linux.TCP_CORK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var v tcpip.CorkOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.CorkOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var v tcpip.QuickAckOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.QuickAckOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + return boolToInt32(v), nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var v tcpip.MaxSegOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.MaxSegOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1272,6 +1330,29 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / 
time.Second), nil + case linux.TCP_SYNCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil + + case linux.TCP_WINDOW_CLAMP: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(v), nil default: emitUnimplementedEventTCP(t, name) } @@ -1290,11 +1371,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - var o int32 - if v { - o = 1 - } - return o, nil + return boolToInt32(v), nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) @@ -1304,8 +1381,8 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if outLen == 0 { return make([]byte, 0), nil } - var v tcpip.IPv6TrafficClassOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1318,6 +1395,17 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf } return ib, nil + case linux.IPV6_RECVTCLASS: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + return boolToInt32(v), nil + default: emitUnimplementedEventIPv6(t, name) } @@ -1332,8 +1420,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in return nil, syserr.ErrInvalidArgument } - var v tcpip.TTLOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.TTLOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1349,8 +1437,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in return nil, syserr.ErrInvalidArgument } - var v tcpip.MulticastTTLOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -1375,23 +1463,19 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in return nil, syserr.ErrInvalidArgument } - var v tcpip.MulticastLoopOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptBool(tcpip.MulticastLoopOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } - - if v { - return int32(1), nil - } - return int32(0), nil + return boolToInt32(v), nil case linux.IP_TOS: // Length handling for parity with Linux. 
if outLen == 0 { return []byte(nil), nil } - var v tcpip.IPv4TOSOption - if err := ep.GetSockOpt(&v); err != nil { + v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) + if err != nil { return nil, syserr.TranslateNetstackError(err) } if outLen < sizeOfInt32 { @@ -1408,11 +1492,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - var o int32 - if v { - o = 1 - } - return o, nil + return boolToInt32(v), nil case linux.IP_PKTINFO: if outLen < sizeOfInt32 { @@ -1423,11 +1503,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - var o int32 - if v { - o = 1 - } - return o, nil + return boolToInt32(v), nil default: emitUnimplementedEventIP(t, name) @@ -1487,7 +1563,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // SetSockOpt can be used to implement the linux syscall setsockopt(2) for // sockets backed by a commonEndpoint. -func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { +func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: return setSockOptSocket(t, s, ep, name, optVal) @@ -1514,7 +1590,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. -func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { @@ -1538,7 +1614,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReuseAddressOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReuseAddressOption, v != 0)) case linux.SO_REUSEPORT: if len(optVal) < sizeOfInt32 { @@ -1546,7 +1622,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.ReusePortOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReusePortOption, v != 0)) case linux.SO_BINDTODEVICE: n := bytes.IndexByte(optVal, 0) @@ -1574,7 +1650,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BroadcastOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.BroadcastOption, v != 0)) case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { @@ -1582,7 +1658,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.PasscredOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.PasscredOption, v != 0)) case linux.SO_KEEPALIVE: if len(optVal) < sizeOfInt32 { @@ -1590,7 +1666,7 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i } v := usermem.ByteOrder.Uint32(optVal) - return 
syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveEnabledOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.KeepaliveEnabledOption, v != 0)) case linux.SO_SNDTIMEO: if len(optVal) < linux.SizeOfTimeval { @@ -1662,11 +1738,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - var o int - if v == 0 { - o = 1 - } - return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.DelayOption, o)) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.DelayOption, v == 0)) case linux.TCP_CORK: if len(optVal) < sizeOfInt32 { @@ -1674,7 +1746,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.CorkOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.CorkOption, v != 0)) case linux.TCP_QUICKACK: if len(optVal) < sizeOfInt32 { @@ -1682,7 +1754,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.QuickAckOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.QuickAckOption, v != 0)) case linux.TCP_MAXSEG: if len(optVal) < sizeOfInt32 { @@ -1690,7 +1762,7 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MaxSegOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) case linux.TCP_KEEPIDLE: if len(optVal) < sizeOfInt32 { @@ -1750,6 +1822,22 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + case linux.TCP_SYNCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) + + case linux.TCP_WINDOW_CLAMP: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := usermem.ByteOrder.Uint32(optVal) + + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -1801,7 +1889,15 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) if v == -1 { v = 0 } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) + + case linux.IPV6_RECVTCLASS: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) default: emitUnimplementedEventIPv6(t, name) @@ -1878,7 +1974,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s if v < 0 || v > 255 { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastTTLOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) case linux.IP_ADD_MEMBERSHIP: req, err := copyInMulticastRequest(optVal, false /* allowAddr */) @@ -1925,9 +2021,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) 
*s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt( - tcpip.MulticastLoopOption(v != 0), - )) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.MulticastLoopOption, v != 0)) case linux.MCAST_JOIN_GROUP: // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. @@ -1946,7 +2040,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } else if v < 1 || v > 255 { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TTLOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v))) case linux.IP_TOS: if len(optVal) == 0 { @@ -1956,7 +2050,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s if err != nil { return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v))) + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) case linux.IP_RECVTOS: v, err := parseIntOrChar(optVal) @@ -2086,7 +2180,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) { linux.IPV6_RECVPATHMTU, linux.IPV6_RECVPKTINFO, linux.IPV6_RECVRTHDR, - linux.IPV6_RECVTCLASS, linux.IPV6_RTHDR, linux.IPV6_RTHDRDSTOPTS, linux.IPV6_TCLASS, @@ -2222,7 +2315,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2234,7 +2327,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2249,7 +2342,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // caller. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { +func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { var err *syserr.Error var copied int @@ -2281,6 +2374,10 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq } copied += n s.readView.TrimFront(n) + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + dst = dst.DropFirst(n) if e != nil { err = syserr.FromError(e) @@ -2297,7 +2394,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq return 0, err } -func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { +func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { if !s.sockOptInq { return } @@ -2312,7 +2409,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. 
-func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -2327,9 +2424,9 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe // caller-supplied buffer. s.readMu.Lock() n, err := s.coalescingRead(ctx, dst, trunc) - s.readMu.Unlock() cmsg := s.controlMessages() s.fillCmsgInq(&cmsg) + s.readMu.Unlock() return n, 0, nil, 0, cmsg, err } @@ -2403,6 +2500,10 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe s.readView.TrimFront(int(n)) } + if len(s.readView) == 0 { + atomic.StoreUint32(&s.readViewHasData, 0) + } + var flags int if msgLen > int(n) { flags |= linux.MSG_TRUNC @@ -2417,13 +2518,15 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe return n, flags, addr, addrLen, cmsg, syserr.FromError(err) } -func (s *SocketOperations) controlMessages() socket.ControlMessages { +func (s *socketOpsCommon) controlMessages() socket.ControlMessages { return socket.ControlMessages{ IP: tcpip.ControlMessages{ HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp, HasTOS: s.readCM.HasTOS, TOS: s.readCM.TOS, + HasTClass: s.readCM.HasTClass, + TClass: s.readCM.TClass, HasIPPacketInfo: s.readCM.HasIPPacketInfo, PacketInfo: s.readCM.PacketInfo, }, @@ -2434,7 +2537,7 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages { // successfully writing packet data out to userspace. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) updateTimestamp() { +func (s *socketOpsCommon) updateTimestamp() { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true @@ -2444,7 +2547,7 @@ func (s *SocketOperations) updateTimestamp() { // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -2512,7 +2615,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. 
-func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument @@ -2588,6 +2691,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, io, args) +} + +func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. @@ -2612,13 +2719,15 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, } // Add bytes removed from the endpoint but not yet sent to the caller. + s.readMu.Lock() v += len(s.readView) + s.readMu.Unlock() if v > math.MaxInt32 { v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2687,7 +2796,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc if v > math.MaxInt32 { v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2703,7 +2812,7 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } - // Copy result to user-space. + // Copy result to userspace. _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ AddressSpaceActive: true, }) @@ -2925,7 +3034,7 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { if s.family != linux.AF_INET && s.family != linux.AF_INET6 { // States not implemented for this socket's family. return 0 @@ -2985,6 +3094,8 @@ func (s *SocketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.skType, s.protocol } + +// LINT.ThenChange(./netstack_vfs2.go) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go new file mode 100644 index 000000000..fcd8013c0 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -0,0 +1,317 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
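The Ioctl change above is part of the broader refactor in this diff: the VFS1 SocketOperations embeds a socketOpsCommon that also backs SocketVFS2, and the VFS1 method becomes a thin wrapper over the shared implementation. A generic sketch of that sharing-by-embedding pattern with illustrative type names:

```go
package main

import "fmt"

// opsCommon holds behavior shared by two front-end types, the way
// socketOpsCommon backs both SocketOperations (VFS1) and SocketVFS2.
type opsCommon struct {
	name string
}

func (c *opsCommon) ioctl(cmd int) string {
	return fmt.Sprintf("%s: ioctl(%#x)", c.name, cmd)
}

// vfs1Socket and vfs2Socket satisfy their respective file interfaces but
// delegate the real work to the embedded common struct.
type vfs1Socket struct{ opsCommon }
type vfs2Socket struct{ opsCommon }

func (s *vfs1Socket) Ioctl(cmd int) string { return s.opsCommon.ioctl(cmd) }
func (s *vfs2Socket) Ioctl(cmd int) string { return s.opsCommon.ioctl(cmd) }

func main() {
	a := &vfs1Socket{opsCommon{name: "vfs1"}}
	b := &vfs2Socket{opsCommon{name: "vfs2"}}
	fmt.Println(a.Ioctl(0x541B)) // 0x541B is FIONREAD on Linux.
	fmt.Println(b.Ioctl(0x541B))
}
```

This is also why the FIONREAD path above now takes readMu around len(s.readView): the shared helper can be reached from either file implementation.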
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&SocketVFS2{}) + +// NewVFS2 creates a new endpoint socket. +func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { + if skType == linux.SOCK_STREAM { + if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + } + + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + s := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. 
+ // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + f := &ioSequencePayload{ctx: ctx, src: src} + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + if err == tcpip.ErrWouldBlock { + return 0, syserror.ErrWouldBlock + } + + if resCh != nil { + if err := amutex.Block(ctx, resCh); err != nil { + return 0, err + } + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) + } + + if err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if int64(n) < src.NumBytes() { + return int64(n), syserror.ErrWouldBlock + } + + return int64(n), nil +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + + t.Kernel().RecordSocketVFS2(ns) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, uio, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. 
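Write in netstack_vfs2.go above follows netstack's convention that the endpoint may hand back a notification channel: block on it (interruptibly, via amutex.Block) and retry the write once it fires. A standalone sketch of that "write, wait on the returned channel, retry" shape; the endpoint type here is a stand-in, not tcpip.Endpoint.

```go
package main

import (
	"fmt"
	"time"
)

// endpoint models a writer that may ask the caller to wait on a channel
// before retrying, similar in shape to tcpip.Endpoint.Write returning a
// notification channel.
type endpoint struct {
	ready chan struct{}
	armed bool
}

func (e *endpoint) write(p []byte) (int, <-chan struct{}, error) {
	if !e.armed {
		e.armed = true
		return 0, e.ready, nil // Caller should wait, then retry.
	}
	return len(p), nil, nil
}

func send(e *endpoint, p []byte) (int, error) {
	n, resCh, err := e.write(p)
	if err != nil {
		return 0, err
	}
	if resCh != nil {
		<-resCh // In the sentry this wait is interruptible via amutex.Block.
		n, _, err = e.write(p)
		if err != nil {
			return 0, err
		}
	}
	return n, nil
}

func main() {
	e := &endpoint{ready: make(chan struct{})}
	go func() { time.Sleep(10 * time.Millisecond); close(e.ready) }()
	n, err := send(e, []byte("payload"))
	fmt.Println(n, err) // 7 <nil>
}
```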
+ if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) + if err != nil { + return nil, err + } + return info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return entries, nil + + } + } + + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + } + } + + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) +} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 5afff2564..ead3b2b79 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -33,6 +33,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // provider is an inet socket provider. 
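SetSockOpt above keeps the SO_TIMESTAMP and TCP_INQ toggles local to the socket, decoding the 4-byte option value under readMu. A standalone sketch of that option-toggle handling; the real code uses the sentry's usermem.ByteOrder, and a little-endian host is assumed here purely for the example.

```go
package main

import (
	"encoding/binary"
	"errors"
	"fmt"
	"sync"
)

const sizeOfInt32 = 4

var errInvalidArgument = errors.New("invalid argument")

// sockOptState holds per-socket toggles that are not delegated to the
// shared endpoint, analogous to sockOptTimestamp and sockOptInq.
type sockOptState struct {
	mu        sync.Mutex
	timestamp bool
	inq       bool
}

// setBoolOpt decodes the userspace-supplied 4-byte value and updates the
// requested toggle under the lock. Little-endian stands in for the host
// byte order in this sketch.
func (s *sockOptState) setBoolOpt(target *bool, optVal []byte) error {
	if len(optVal) < sizeOfInt32 {
		return errInvalidArgument
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	*target = binary.LittleEndian.Uint32(optVal) != 0
	return nil
}

func main() {
	var st sockOptState
	_ = st.setBoolOpt(&st.timestamp, []byte{1, 0, 0, 0})
	fmt.Println(st.timestamp) // true
}
```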
type provider struct { family int @@ -62,10 +64,6 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in } case linux.SOCK_RAW: - // TODO(b/142504697): "In order to create a raw socket, a - // process must have the CAP_NET_RAW capability in the user - // namespace that governs its network namespace." - raw(7) - // Raw sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_RAW) { @@ -75,6 +73,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in switch protocol { case syscall.IPPROTO_ICMP: return header.ICMPv4ProtocolNumber, true, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, true, nil case syscall.IPPROTO_UDP: return header.UDPProtocolNumber, true, nil case syscall.IPPROTO_TCP: @@ -124,6 +124,12 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) } else { ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } } if e != nil { return nil, syserr.TranslateNetstackError(e) @@ -133,10 +139,6 @@ func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (* } func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { - // TODO(b/142504697): "In order to create a packet socket, a process - // must have the CAP_NET_RAW capability in the user namespace that - // governs its network namespace." - packet(7) - // Packet sockets require CAP_NET_RAW. creds := auth.CredentialsFromContext(t) if !creds.HasCapability(linux.CAP_NET_RAW) { @@ -167,6 +169,8 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol return New(t, linux.AF_PACKET, stype, protocol, wq, ep) } +// LINT.ThenChange(./provider_vfs2.go) + // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go new file mode 100644 index 000000000..2a01143f6 --- /dev/null +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -0,0 +1,141 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/waiter" +) + +// providerVFS2 is an inet socket provider. 
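getTransportProtocol above gains an IPPROTO_ICMPV6 case for raw sockets, mapping it to the ICMPv6 transport number. A simplified standalone sketch of that switch, using the standard IANA protocol numbers from the syscall package in place of gVisor's header constants; the default branch here simply rejects, which is a simplification of the real code's handling.

```go
package main

import (
	"fmt"
	"syscall"
)

// transportProto mirrors the idea of tcpip.TransportProtocolNumber; for raw
// sockets the transport number equals the IP protocol number.
type transportProto uint32

// rawTransportProtocol resolves the protocol argument of a SOCK_RAW socket,
// including the ICMPv6 case added in the diff. The second result reports
// whether the endpoint is associated with that transport.
func rawTransportProtocol(protocol int) (transportProto, bool, error) {
	switch protocol {
	case syscall.IPPROTO_ICMP: // 1 -> ICMPv4.
		return transportProto(syscall.IPPROTO_ICMP), true, nil
	case syscall.IPPROTO_ICMPV6: // 58 -> ICMPv6 (new in this diff).
		return transportProto(syscall.IPPROTO_ICMPV6), true, nil
	case syscall.IPPROTO_UDP:
		return transportProto(syscall.IPPROTO_UDP), true, nil
	case syscall.IPPROTO_TCP:
		return transportProto(syscall.IPPROTO_TCP), true, nil
	default:
		return 0, false, fmt.Errorf("unsupported raw protocol %d", protocol)
	}
}

func main() {
	p, assoc, err := rawTransportProtocol(syscall.IPPROTO_ICMPV6)
	fmt.Println(p, assoc, err) // 58 true <nil>
}
```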
+type providerVFS2 struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. +func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Fail right away if we don't have a stack. + stack := t.NetworkContext() + if stack == nil { + // Don't propagate an error here. Instead, allow the socket + // code to continue searching for another provider. + return nil, nil + } + eps, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocketVFS2(t, eps, stype, protocol) + } + + // Figure out the transport protocol. + transProto, associated, err := getTransportProtocol(t, stype, protocol) + if err != nil { + return nil, err + } + + // Create the endpoint. + var ep tcpip.Endpoint + var e *tcpip.Error + wq := &waiter.Queue{} + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } + } + if e != nil { + return nil, syserr.TranslateNetstackError(e) + } + + return NewVFS2(t, p.family, stype, int(transProto), wq, ep) +} + +func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + +// Pair just returns nil sockets (not supported). +func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + return nil, nil, nil +} + +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. +func init() { + // Providers backed by netstack. 
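packetSocketVFS2 above converts the AF_PACKET protocol argument from network byte order with ntohs before handing it to netstack. A standalone sketch of that conversion, assuming a little-endian host (as the sentry does), where ntohs is a byte swap:

```go
package main

import "fmt"

// ntohs converts a 16-bit value from network (big-endian) byte order to
// host order; on a little-endian host this is a byte swap. For AF_PACKET
// the protocol argument arrives in network order, e.g. htons(ETH_P_ALL).
func ntohs(v uint16) uint16 {
	return v<<8 | v>>8
}

func main() {
	const ethPAllNetworkOrder = 0x0300 // htons(0x0003), i.e. ETH_P_ALL as passed by userspace.
	fmt.Printf("%#06x\n", ntohs(ethPAllNetworkOrder)) // 0x0003
}
```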
+ p := []providerVFS2{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + { + family: linux.AF_PACKET, + }, + } + + for i := range p { + socket.RegisterProviderVFS2(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 0692482e9..9b44c2b89 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" - "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -200,36 +199,66 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { switch stats := stat.(type) { + case *inet.StatDev: + for _, ni := range s.Stack.NICInfo() { + if ni.Name != arg { + continue + } + // TODO(gvisor.dev/issue/2103) Support stubbed stats. + *stats = inet.StatDev{ + // Receive section. + ni.Stats.Rx.Bytes.Value(), // bytes. + ni.Stats.Rx.Packets.Value(), // packets. + 0, // errs. + 0, // drop. + 0, // fifo. + 0, // frame. + 0, // compressed. + 0, // multicast. + // Transmit section. + ni.Stats.Tx.Bytes.Value(), // bytes. + ni.Stats.Tx.Packets.Value(), // packets. + 0, // errs. + 0, // drop. + 0, // fifo. + 0, // colls. + 0, // carrier. + 0, // compressed. + } + break + } case *inet.StatSNMPIP: ip := Metrics.IP + // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPIP{ - 0, // TODO(gvisor.dev/issue/969): Support Ip/Forwarding. - 0, // TODO(gvisor.dev/issue/969): Support Ip/DefaultTTL. + 0, // Ip/Forwarding. + 0, // Ip/DefaultTTL. ip.PacketsReceived.Value(), // InReceives. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InHdrErrors. + 0, // Ip/InHdrErrors. ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ForwDatagrams. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InUnknownProtos. - 0, // TODO(gvisor.dev/issue/969): Support Ip/InDiscards. + 0, // Ip/ForwDatagrams. + 0, // Ip/InUnknownProtos. + 0, // Ip/InDiscards. ip.PacketsDelivered.Value(), // InDelivers. ip.PacketsSent.Value(), // OutRequests. ip.OutgoingPacketErrors.Value(), // OutDiscards. - 0, // TODO(gvisor.dev/issue/969): Support Ip/OutNoRoutes. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmTimeout. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmReqds. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmOKs. - 0, // TODO(gvisor.dev/issue/969): Support Ip/ReasmFails. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragOKs. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragFails. - 0, // TODO(gvisor.dev/issue/969): Support Ip/FragCreates. + 0, // Ip/OutNoRoutes. + 0, // Support Ip/ReasmTimeout. + 0, // Support Ip/ReasmReqds. + 0, // Support Ip/ReasmOKs. + 0, // Support Ip/ReasmFails. + 0, // Support Ip/FragOKs. + 0, // Support Ip/FragFails. + 0, // Support Ip/FragCreates. } case *inet.StatSNMPICMP: in := Metrics.ICMP.V4PacketsReceived.ICMPv4PacketStats out := Metrics.ICMP.V4PacketsSent.ICMPv4PacketStats + // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPICMP{ - 0, // TODO(gvisor.dev/issue/969): Support Icmp/InMsgs. + 0, // Icmp/InMsgs. Metrics.ICMP.V4PacketsSent.Dropped.Value(), // InErrors. 
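The init function above registers one providerVFS2 per supported family, and it deliberately registers &p[i] rather than the address of a range loop variable. A standalone sketch of that registry pattern showing why the index form is used; names here are illustrative.

```go
package main

import "fmt"

// provider is a stand-in for a per-family socket provider.
type provider struct{ family int }

var registry = make(map[int][]*provider)

func register(family int, p *provider) {
	registry[family] = append(registry[family], p)
}

func main() {
	providers := []provider{{family: 2}, {family: 10}, {family: 17}} // AF_INET, AF_INET6, AF_PACKET.

	// Registering &providers[i] stores a distinct, stable pointer per entry.
	// Taking the address of a `for _, p := range` loop variable would (before
	// Go 1.22) register the same pointer for every family.
	for i := range providers {
		register(providers[i].family, &providers[i])
	}

	for family, ps := range registry {
		fmt.Println(family, ps[0].family)
	}
}
```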
- 0, // TODO(gvisor.dev/issue/969): Support Icmp/InCsumErrors. + 0, // Icmp/InCsumErrors. in.DstUnreachable.Value(), // InDestUnreachs. in.TimeExceeded.Value(), // InTimeExcds. in.ParamProblem.Value(), // InParmProbs. @@ -241,7 +270,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { in.TimestampReply.Value(), // InTimestampReps. in.InfoRequest.Value(), // InAddrMasks. in.InfoReply.Value(), // InAddrMaskReps. - 0, // TODO(gvisor.dev/issue/969): Support Icmp/OutMsgs. + 0, // Icmp/OutMsgs. Metrics.ICMP.V4PacketsReceived.Invalid.Value(), // OutErrors. out.DstUnreachable.Value(), // OutDestUnreachs. out.TimeExceeded.Value(), // OutTimeExcds. @@ -277,15 +306,16 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { } case *inet.StatSNMPUDP: udp := Metrics.UDP + // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPUDP{ udp.PacketsReceived.Value(), // InDatagrams. udp.UnknownPortErrors.Value(), // NoPorts. - 0, // TODO(gvisor.dev/issue/969): Support Udp/InErrors. + 0, // Udp/InErrors. udp.PacketsSent.Value(), // OutDatagrams. udp.ReceiveBufferErrors.Value(), // RcvbufErrors. - 0, // TODO(gvisor.dev/issue/969): Support Udp/SndbufErrors. - 0, // TODO(gvisor.dev/issue/969): Support Udp/InCsumErrors. - 0, // TODO(gvisor.dev/issue/969): Support Udp/IgnoredMulti. + 0, // Udp/SndbufErrors. + 0, // Udp/InCsumErrors. + 0, // Udp/IgnoredMulti. } default: return syserr.ErrEndpointOperation.ToError() @@ -332,14 +362,13 @@ func (s *Stack) RouteTable() []inet.Route { } // IPTables returns the stack's iptables. -func (s *Stack) IPTables() (iptables.IPTables, error) { +func (s *Stack) IPTables() (*stack.IPTables, error) { return s.Stack.IPTables(), nil } -// FillDefaultIPTables sets the stack's iptables to the default tables, which -// allow and do not modify all traffic. -func (s *Stack) FillDefaultIPTables() { - netfilter.FillDefaultIPTables(s.Stack) +// FillIPTablesMetadata populates stack's IPTables with metadata. +func (s *Stack) FillIPTablesMetadata() { + netfilter.FillIPTablesMetadata(s.Stack) } // Resume implements inet.Stack.Resume. diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 50d9744e6..6580bd6e9 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" @@ -48,11 +49,25 @@ func (c *ControlMessages) Release() { c.Unix.Release() } -// Socket is the interface containing socket syscalls used by the syscall layer -// to redirect them to the appropriate implementation. +// Socket is an interface combining fs.FileOperations and SocketOps, +// representing a VFS1 socket file. type Socket interface { fs.FileOperations + SocketOps +} + +// SocketVFS2 is an interface combining vfs.FileDescription and SocketOps, +// representing a VFS2 socket file. +type SocketVFS2 interface { + vfs.FileDescriptionImpl + SocketOps +} +// SocketOps is the interface containing socket syscalls used by the syscall +// layer to redirect them to the appropriate implementation. +// +// It is implemented by both Socket and SocketVFS2. +type SocketOps interface { // Connect implements the connect(2) linux syscall. 
Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error @@ -153,6 +168,8 @@ var families = make(map[int][]Provider) // RegisterProvider registers the provider of a given address family so that // sockets of that type can be created via socket() and/or socketpair() // syscalls. +// +// This should only be called during the initialization of the address family. func RegisterProvider(family int, provider Provider) { families[family] = append(families[family], provider) } @@ -216,6 +233,74 @@ func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino)) } +// ProviderVFS2 is the vfs2 interface implemented by providers of sockets for +// specific address families (e.g., AF_INET). +type ProviderVFS2 interface { + // Socket creates a new socket. + // + // If a nil Socket _and_ a nil error is returned, it means that the + // protocol is not supported. A non-nil error should only be returned + // if the protocol is supported, but an error occurs during creation. + Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) + + // Pair creates a pair of connected sockets. + // + // See Socket for error information. + Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) +} + +// familiesVFS2 holds a map of all known address families and their providers. +var familiesVFS2 = make(map[int][]ProviderVFS2) + +// RegisterProviderVFS2 registers the provider of a given address family so that +// sockets of that type can be created via socket() and/or socketpair() +// syscalls. +// +// This should only be called during the initialization of the address family. +func RegisterProviderVFS2(family int, provider ProviderVFS2) { + familiesVFS2[family] = append(familiesVFS2[family], provider) +} + +// NewVFS2 creates a new socket with the given family, type and protocol. +func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + for _, p := range familiesVFS2[family] { + s, err := p.Socket(t, stype, protocol) + if err != nil { + return nil, err + } + if s != nil { + t.Kernel().RecordSocketVFS2(s) + return s, nil + } + } + + return nil, syserr.ErrAddressFamilyNotSupported +} + +// PairVFS2 creates a new connected socket pair with the given family, type and +// protocol. +func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + providers, ok := familiesVFS2[family] + if !ok { + return nil, nil, syserr.ErrAddressFamilyNotSupported + } + + for _, p := range providers { + s1, s2, err := p.Pair(t, stype, protocol) + if err != nil { + return nil, nil, err + } + if s1 != nil && s2 != nil { + k := t.Kernel() + k.RecordSocketVFS2(s1) + k.RecordSocketVFS2(s2) + return s1, s2, nil + } + } + + return nil, nil, syserr.ErrSocketNotSupported +} + // SendReceiveTimeout stores timeouts for send and receive calls. 
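socket.go above splits the syscall-facing methods into a SocketOps interface embedded by both the VFS1 and VFS2 socket interfaces, and NewVFS2/PairVFS2 walk the registered providers until one claims the request, where a nil socket with a nil error means "not supported by this provider, keep searching". A generic sketch of that lookup convention with toy types:

```go
package main

import (
	"errors"
	"fmt"
)

var errAFNotSupported = errors.New("address family not supported")

// socketOps stands in for the shared SocketOps interface.
type socketOps interface{ Type() (family, stype, protocol int) }

// provider can decline a (stype, protocol) pair by returning (nil, nil).
type provider interface {
	Socket(stype, protocol int) (socketOps, error)
}

var families = make(map[int][]provider)

// newSocket mirrors the shape of NewVFS2: the first provider that returns a
// non-nil socket wins; a nil socket with a nil error means "not mine".
func newSocket(family, stype, protocol int) (socketOps, error) {
	for _, p := range families[family] {
		s, err := p.Socket(stype, protocol)
		if err != nil {
			return nil, err
		}
		if s != nil {
			return s, nil
		}
	}
	return nil, errAFNotSupported
}

// inetProvider is a toy provider that only claims protocol 0.
type inetProvider struct{ family int }

type inetSocket struct{ family, stype, protocol int }

func (s *inetSocket) Type() (int, int, int) { return s.family, s.stype, s.protocol }

func (p *inetProvider) Socket(stype, protocol int) (socketOps, error) {
	if protocol != 0 {
		return nil, nil // Decline; let another provider try.
	}
	return &inetSocket{p.family, stype, protocol}, nil
}

func main() {
	families[2] = append(families[2], &inetProvider{family: 2}) // AF_INET.
	s, err := newSocket(2, 1, 0)
	fmt.Println(s != nil, err) // true <nil>
	_, err = newSocket(2, 1, 99)
	fmt.Println(err) // address family not supported
}
```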
// // It is meant to be embedded into Socket implementations to help satisfy the diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 08743deba..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -8,23 +8,27 @@ go_library( "device.go", "io.go", "unix.go", + "unix_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 74bcd6300..c708b6030 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/ilist", + "//pkg/log", "//pkg/refs", "//pkg/sync", "//pkg/syserr", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index ce5b94ee7..09c6d3b27 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -252,7 +252,7 @@ func (e *connectionedEndpoint) Close() { // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { - return syserr.ErrConnectionRefused + return syserr.ErrWrongProtocolForSocket } // Check if ce is e to avoid a deadlock. diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2ef654235..2f1b127df 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -838,24 +839,43 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess // SetSockOpt sets a socket option. Currently not supported. 
func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { - switch v := opt.(type) { - case tcpip.PasscredOption: - e.setPasscred(v != 0) - return nil - } return nil } func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { + switch opt { + case tcpip.BroadcastOption: + case tcpip.PasscredOption: + e.setPasscred(v) + case tcpip.ReuseAddressOption: + default: + log.Warningf("Unsupported socket option: %d", opt) + } return nil } func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { + switch opt { + case tcpip.SendBufferSizeOption: + case tcpip.ReceiveBufferSizeOption: + default: + log.Warningf("Unsupported socket option: %d", opt) + } return nil } func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { - return false, tcpip.ErrUnknownProtocolOption + switch opt { + case tcpip.KeepaliveEnabledOption: + return false, nil + + case tcpip.PasscredOption: + return e.Passcred(), nil + + default: + log.Warningf("Unsupported socket option: %d", opt) + return false, tcpip.ErrUnknownProtocolOption + } } func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { @@ -914,29 +934,19 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { return int(v), nil default: + log.Warningf("Unsupported socket option: %d", opt) return -1, tcpip.ErrUnknownProtocolOption } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch o := opt.(type) { + switch opt.(type) { case tcpip.ErrorOption: return nil - case *tcpip.PasscredOption: - if e.Passcred() { - *o = tcpip.PasscredOption(1) - } else { - *o = tcpip.PasscredOption(0) - } - return nil - - case *tcpip.KeepaliveEnabledOption: - *o = 0 - return nil - default: + log.Warningf("Unsupported socket option: %T", opt) return tcpip.ErrUnknownProtocolOption } } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4d30aa714..c4c9db81b 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -52,11 +54,8 @@ type SocketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - refs.AtomicRefCount - socket.SendReceiveTimeout - ep transport.Endpoint - stype linux.SockType + socketOpsCommon } // New creates a new unix socket. @@ -75,29 +74,42 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty } s := SocketOperations{ - ep: ep, - stype: stype, + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, } s.EnableLeakCheck("unix.SocketOperations") return fs.NewFile(ctx, d, flags, &s) } +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + refs.AtomicRefCount + socket.SendReceiveTimeout + + ep transport.Endpoint + stype linux.SockType +} + // DecRef implements RefCounter.DecRef. 
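The unix transport endpoint above now accepts the handful of options it understands (some as no-ops) and logs a warning for anything unrecognized instead of failing. A standalone sketch of that "accept known, warn on unknown" shape, using the standard log package and illustrative option constants:

```go
package main

import (
	"errors"
	"fmt"
	"log"
)

var errUnknownOption = errors.New("unknown protocol option")

// Option identifiers; stand-ins for tcpip.SockOptBool values.
const (
	optBroadcast = iota
	optPasscred
	optReuseAddress
)

// endpoint stores only the options it actually implements.
type endpoint struct {
	passcred bool
}

// setSockOptBool accepts known options (some as no-ops) and logs anything
// unrecognized rather than returning an error, matching the diff's behavior.
func (e *endpoint) setSockOptBool(opt int, v bool) error {
	switch opt {
	case optBroadcast, optReuseAddress:
		// Accepted but ignored for unix sockets.
	case optPasscred:
		e.passcred = v
	default:
		log.Printf("Unsupported socket option: %d", opt)
	}
	return nil
}

func (e *endpoint) getSockOptBool(opt int) (bool, error) {
	switch opt {
	case optPasscred:
		return e.passcred, nil
	default:
		log.Printf("Unsupported socket option: %d", opt)
		return false, errUnknownOption
	}
}

func main() {
	var e endpoint
	_ = e.setSockOptBool(optPasscred, true)
	v, err := e.getSockOptBool(optPasscred)
	fmt.Println(v, err) // true <nil>
}
```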
-func (s *SocketOperations) DecRef() { +func (s *socketOpsCommon) DecRef() { s.DecRefWithDestructor(func() { s.ep.Close() }) } // Release implemements fs.FileOperations.Release. -func (s *SocketOperations) Release() { +func (s *socketOpsCommon) Release() { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. s.DecRef() } -func (s *SocketOperations) isPacket() bool { +func (s *socketOpsCommon) isPacket() bool { switch s.stype { case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: return true @@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool { } // Endpoint extracts the transport.Endpoint. -func (s *SocketOperations) Endpoint() transport.Endpoint { +func (s *socketOpsCommon) Endpoint() transport.Endpoint { return s.ep } @@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) { // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // GetSockName implements the linux syscall getsockname(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // Listen implements the linux syscall listen(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return s.ep.Listen(backlog) } @@ -310,6 +322,11 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Create the socket. + // + // Note that the file permissions here are not set correctly (see + // gvisor.dev/issue/2324). There is no convenient way to get permissions + // on the socket referred to by s, so we will leave this discrepancy + // unresolved until VFS2 replaces this code. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse @@ -345,6 +362,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, return ep, nil } + if kernel.VFS2Enabled { + p := fspath.Parse(path) + root := t.FSContext().RootDirectoryVFS2() + start := root + relPath := !p.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: p, + FollowFinalSymlink: true, + } + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) + root.DecRef() + if relPath { + start.DecRef() + } + if e != nil { + return nil, syserr.FromError(e) + } + return ep, nil + } + // Find the node in the filesystem. 
root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() @@ -363,12 +405,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, // No socket! return nil, syserr.ErrConnectionRefused } - return ep, nil } // Connect implements the linux syscall connect(2) for unix sockets. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { ep, err := extractEndpoint(t, sockaddr) if err != nil { return err @@ -376,10 +417,21 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo defer ep.Release() // Connect the server endpoint. - return s.ep.Connect(t, ep) + err = s.ep.Connect(t, ep) + + if err == syserr.ErrWrongProtocolForSocket { + // Linux for abstract sockets returns ErrConnectionRefused + // instead of ErrWrongProtocolForSocket. + path, _ := extractPath(sockaddr) + if len(path) > 0 && path[0] == 0 { + err = syserr.ErrConnectionRefused + } + } + + return err } -// Writev implements fs.FileOperations.Write. +// Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep, nil) @@ -399,7 +451,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Ctx: t, Endpoint: s.ep, @@ -453,27 +505,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } // Passcred implements transport.Credentialer.Passcred. -func (s *SocketOperations) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { return s.ep.Passcred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *SocketOperations) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } // Readiness implements waiter.Waitable.Readiness. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.ep.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *SocketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } @@ -485,7 +537,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. 
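Connect above translates ErrWrongProtocolForSocket back to ErrConnectionRefused when the target is an abstract-namespace socket, i.e. a sun_path that begins with a NUL byte, to match Linux behavior. A standalone sketch of that address check and error mapping with plain sentinel errors standing in for syserr values:

```go
package main

import (
	"errors"
	"fmt"
)

var (
	errWrongProtocolForSocket = errors.New("wrong protocol for socket")
	errConnectionRefused      = errors.New("connection refused")
)

// isAbstract reports whether a unix socket path names the abstract
// namespace: Linux marks these with a leading NUL byte.
func isAbstract(path string) bool {
	return len(path) > 0 && path[0] == 0
}

// translateConnectError mirrors the diff's adjustment: for abstract targets
// Linux reports ECONNREFUSED rather than a protocol-type error when the
// socket types do not match.
func translateConnectError(err error, path string) error {
	if errors.Is(err, errWrongProtocolForSocket) && isAbstract(path) {
		return errConnectionRefused
	}
	return err
}

func main() {
	abstract := "\x00example"
	fmt.Println(translateConnectError(errWrongProtocolForSocket, abstract))    // connection refused
	fmt.Println(translateConnectError(errWrongProtocolForSocket, "/tmp/sock")) // wrong protocol for socket
}
```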
-func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := netstack.ConvertShutdown(how) if err != nil { return err @@ -511,7 +563,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -648,12 +700,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // State implements socket.Socket.State. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. -func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { // Unix domain sockets always have a protocol of 0. return linux.AF_UNIX, s.stype, 0 } @@ -706,4 +758,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F func init() { socket.RegisterProvider(linux.AF_UNIX, &provider{}) + socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{}) } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go new file mode 100644 index 000000000..45e109361 --- /dev/null +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -0,0 +1,358 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 implements socket.SocketVFS2 (and by extension, +// vfs.FileDescriptionImpl) for Unix sockets. 
+type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&SocketVFS2{}) + +// NewSockfsFile creates a new socket file in the global sockfs mount and +// returns a corresponding file description. +func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + d := sockfs.NewDentry(t.Credentials(), mnt) + + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) + if err != nil { + return nil, syserr.FromError(err) + } + return fd, nil +} + +// NewFileDescription creates and returns a socket file description +// corresponding to the given mount and dentry. +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) { + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + + sock := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, + } + vfsfd := &sock.vfsfd + if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return vfsfd, nil +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accept, it will block until one becomes ready. +func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.socketOpsCommon.EventRegister(&e, waiter.EventIn) + defer s.socketOpsCommon.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. + for { + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewSockfsFile(t, ep, s.stype) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer. 
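blockingAccept above registers a waiter entry and loops between Accept attempts and blocking on the notification channel. A standalone sketch of that retry loop with a plain channel standing in for the waiter notification and t.Block:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

var errWouldBlock = errors.New("operation would block")

// listener models an endpoint whose accept fails with errWouldBlock until a
// connection shows up; notify plays the role of the waiter channel.
type listener struct {
	conns  chan string
	notify chan struct{}
}

func (l *listener) accept() (string, error) {
	select {
	case c := <-l.conns:
		return c, nil
	default:
		return "", errWouldBlock
	}
}

// blockingAccept mirrors the shape of the VFS2 helper: try, and if the
// endpoint is not ready, wait for a notification and try again.
func (l *listener) blockingAccept() (string, error) {
	for {
		if c, err := l.accept(); !errors.Is(err, errWouldBlock) {
			return c, err
		}
		<-l.notify // In the sentry this is t.Block(ch), which is interruptible.
	}
}

func main() {
	l := &listener{conns: make(chan string, 1), notify: make(chan struct{}, 1)}
	go func() {
		time.Sleep(10 * time.Millisecond)
		l.conns <- "peer"
		l.notify <- struct{}{}
	}()
	c, err := l.blockingAccept()
	fmt.Println(c, err) // peer <nil>
}
```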
+ var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + t.Kernel().RecordSocketVFS2(ns) + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(transport.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { + // Is it abstract? + if p[0] == 0 { + if t.IsNetworkNamespaced() { + return syserr.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse + } + } else { + path := fspath.Parse(p) + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + relPath := !path.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + } + stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) + if err != nil { + return syserr.FromError(err) + } + err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // File permissions correspond to net/unix/af_unix.c:unix_bind. + Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), + Endpoint: bep, + }) + if err == syserror.EEXIST { + return syserr.ErrAddressInUse + } + return syserr.FromError(err) + } + + return nil + }) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return netstack.Ioctl(ctx, s.ep, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &EndpointReader{ + Ctx: ctx, + Endpoint: s.ep, + NumRights: 0, + Peek: false, + From: nil, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
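For filesystem binds, the VFS2 code above creates the socket node with MknodAt, deriving its permissions from the file's mode masked by the task's umask (per net/unix/af_unix.c) and mapping EEXIST to an address-in-use error. A standalone sketch of that mode computation and error mapping:

```go
package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

var errAddressInUse = errors.New("address already in use")

// bindMode computes the mode bits for a newly bound unix socket node:
// S_IFSOCK plus the requested permission bits with the umask cleared,
// mirroring linux.S_IFSOCK | mode &^ umask in the diff.
func bindMode(requested, umask uint32) uint32 {
	return syscall.S_IFSOCK | (requested &^ umask)
}

// translateBindError maps an already-existing node to EADDRINUSE, which is
// what bind(2) reports when the path is taken.
func translateBindError(err error) error {
	if errors.Is(err, os.ErrExist) {
		return errAddressInUse
	}
	return err
}

func main() {
	fmt.Printf("%#o\n", bindMode(0o777, 0o022)) // 0140755
	fmt.Println(translateBindError(syscall.EEXIST))
}
```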
+ if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Ctx: ctx, + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// providerVFS2 is a unix domain socket provider for VFS2. +type providerVFS2 struct{} + +func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported + } + + // Create the endpoint and socket. + var ep transport.Endpoint + switch stype { + case linux.SOCK_DGRAM, linux.SOCK_RAW: + ep = transport.NewConnectionless(t) + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: + ep = transport.NewConnectioned(t, stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + f, err := NewSockfsFile(t, ep, stype) + if err != nil { + ep.Close() + return nil, err + } + return f, nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: + // Ok + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. + ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) + s1, err := NewSockfsFile(t, ep1, stype) + if err != nil { + ep1.Close() + ep2.Close() + return nil, nil, err + } + s2, err := NewSockfsFile(t, ep2, stype) + if err != nil { + s1.DecRef() + ep2.Close() + return nil, nil, err + } + + return s1, s2, nil +} |
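Pair above is careful to release whichever pieces have already been created when a later step fails: the raw endpoints are closed if the first wrapping fails, and the first file is dropped along with the second endpoint if the second wrapping fails. A generic sketch of that cleanup discipline for paired resources, with stand-in types rather than transport endpoints:

```go
package main

import (
	"errors"
	"fmt"
)

// resource is a stand-in for an endpoint or file description that must be
// explicitly released on the error path.
type resource struct{ name string }

func (r *resource) close() { fmt.Println("closed", r.name) }

// wrap models turning an endpoint into a file; fail lets the example force
// an error partway through.
func wrap(r *resource, fail bool) (*resource, error) {
	if fail {
		return nil, errors.New("wrap failed")
	}
	return &resource{name: r.name + "-file"}, nil
}

// newPair mirrors the shape of providerVFS2.Pair: if wrapping the second
// endpoint fails, the first wrapper and the second raw endpoint are
// released so nothing leaks.
func newPair(failSecond bool) (*resource, *resource, error) {
	ep1, ep2 := &resource{"ep1"}, &resource{"ep2"}
	s1, err := wrap(ep1, false)
	if err != nil {
		ep1.close()
		ep2.close()
		return nil, nil, err
	}
	s2, err := wrap(ep2, failSecond)
	if err != nil {
		s1.close() // DecRef in the real code.
		ep2.close()
		return nil, nil, err
	}
	return s1, s2, nil
}

func main() {
	if _, _, err := newPair(true); err != nil {
		fmt.Println("pair failed:", err)
	}
	s1, s2, _ := newPair(false)
	fmt.Println(s1.name, s2.name)
}
```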