diff options
Diffstat (limited to 'pkg/sentry')
84 files changed, 1943 insertions, 1353 deletions
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index ce4f1e42c..d17b1bdcf 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -9,11 +9,11 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/memutil", "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", - "//pkg/sentry/memutil", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/ptrace", diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index 210a235d2..83da40711 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -21,11 +21,11 @@ import ( "testing" "time" + "gvisor.googlesource.com/gvisor/pkg/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index c0bc261a2..a0a35c242 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -805,7 +805,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans var childDir *Dirent err := d.genericCreate(ctx, root, name, func() error { var e error - childDir, e = d.Inode.Bind(ctx, name, data, perms) + childDir, e = d.Inode.Bind(ctx, d, name, data, perms) if e != nil { return e } diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 8c1307235..f64954457 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -545,12 +545,28 @@ type lockedWriter struct { // Write implements io.Writer.Write. func (w *lockedWriter) Write(buf []byte) (int, error) { - n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset) - return int(n), err + return w.WriteAt(buf, w.File.offset) } // WriteAt implements io.Writer.WriteAt. func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) { - n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset) - return int(n), err + var ( + written int + err error + ) + // The io.Writer contract requires that Write writes all available + // bytes and does not return short writes. This causes errors with + // io.Copy, since our own Write interface does not have this same + // contract. Enforce that here. + for written < len(buf) { + var n int64 + n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written)) + if n > 0 { + written += int(n) + } + if err != nil { + break + } + } + return written, err } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index cbd5b9a84..7ac0a421f 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -15,6 +15,7 @@ package gofer import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/p9" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -61,13 +62,13 @@ type endpoint struct { path string } -func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { switch t { - case transport.SockStream: + case linux.SOCK_STREAM: return p9.StreamSocket, true - case transport.SockSeqpacket: + case linux.SOCK_SEQPACKET: return p9.SeqpacketSocket, true - case transport.SockDgram: + case linux.SOCK_DGRAM: return p9.DgramSocket, true } return 0, false @@ -75,7 +76,7 @@ func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) { // BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { - cf, ok := unixSockToP9(ce.Type()) + cf, ok := sockTypeToP9(ce.Type()) if !ok { return syserr.ErrConnectionRefused } @@ -139,3 +140,8 @@ func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr func (e *endpoint) Release() { e.inode.DecRef() } + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 3ed137006..305eea718 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -15,9 +15,11 @@ package host import ( + "fmt" "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/fd" "gvisor.googlesource.com/gvisor/pkg/fdnotifier" "gvisor.googlesource.com/gvisor/pkg/log" @@ -51,25 +53,11 @@ type ConnectedEndpoint struct { // ref keeps track of references to a connectedEndpoint. ref refs.AtomicRefCount - // mu protects fd, readClosed and writeClosed. - mu sync.RWMutex `state:"nosave"` - - // file is an *fd.FD containing the FD backing this endpoint. It must be - // set to nil if it has been closed. - file *fd.FD `state:"nosave"` - - // readClosed is true if the FD has read shutdown or if it has been closed. - readClosed bool - - // writeClosed is true if the FD has write shutdown or if it has been - // closed. - writeClosed bool - // If srfd >= 0, it is the host FD that file was imported from. srfd int `state:"wait"` // stype is the type of Unix socket. - stype transport.SockType + stype linux.SockType // sndbuf is the size of the send buffer. // @@ -78,6 +66,13 @@ type ConnectedEndpoint struct { // prevent lots of small messages from filling the real send buffer // size on the host. sndbuf int `state:"nosave"` + + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // file is an *fd.FD containing the FD backing this endpoint. It must be + // set to nil if it has been closed. + file *fd.FD `state:"nosave"` } // init performs initialization required for creating new ConnectedEndpoints and @@ -111,7 +106,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error { return syserr.ErrInvalidEndpointState } - c.stype = transport.SockType(stype) + c.stype = linux.SockType(stype) c.sndbuf = sndbuf return nil @@ -169,7 +164,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil + return unixsocket.NewWithDirent(ctx, d, ep, e.stype, flags), nil } // newSocket allocates a new unix socket with host endpoint. @@ -201,16 +196,13 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) - return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil + return unixsocket.New(ctx, ep, e.stype), nil } // Send implements transport.ConnectedEndpoint.Send. func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() - if c.writeClosed { - return 0, false, syserr.ErrClosedForSend - } if !controlMessages.Empty() { return 0, false, syserr.ErrInvalidEndpointState @@ -218,7 +210,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.Contro // Since stream sockets don't preserve message boundaries, we can write // only as much of the message as fits in the send buffer. - truncate := c.stype == transport.SockStream + truncate := c.stype == linux.SOCK_STREAM n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate) if n < totalLen && err == nil { @@ -244,8 +236,13 @@ func (c *ConnectedEndpoint) SendNotify() {} // CloseSend implements transport.ConnectedEndpoint.CloseSend. func (c *ConnectedEndpoint) CloseSend() { c.mu.Lock() - c.writeClosed = true - c.mu.Unlock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_WR); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err)) + } } // CloseNotify implements transport.ConnectedEndpoint.CloseNotify. @@ -255,9 +252,7 @@ func (c *ConnectedEndpoint) CloseNotify() {} func (c *ConnectedEndpoint) Writable() bool { c.mu.RLock() defer c.mu.RUnlock() - if c.writeClosed { - return true - } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 } @@ -285,9 +280,6 @@ func (c *ConnectedEndpoint) EventUpdate() { func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() - if c.readClosed { - return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive - } var cm unet.ControlMessage if numRights > 0 { @@ -344,31 +336,34 @@ func (c *ConnectedEndpoint) RecvNotify() {} // CloseRecv implements transport.Receiver.CloseRecv. func (c *ConnectedEndpoint) CloseRecv() { c.mu.Lock() - c.readClosed = true - c.mu.Unlock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_RD); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err)) + } } // Readable implements transport.Receiver.Readable. func (c *ConnectedEndpoint) Readable() bool { c.mu.RLock() defer c.mu.RUnlock() - if c.readClosed { - return true - } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0 } // SendQueuedSize implements transport.Receiver.SendQueuedSize. func (c *ConnectedEndpoint) SendQueuedSize() int64 { - // SendQueuedSize isn't supported for host sockets because we don't allow the - // sentry to call ioctl(2). + // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). return -1 } // RecvQueuedSize implements transport.Receiver.RecvQueuedSize. func (c *ConnectedEndpoint) RecvQueuedSize() int64 { - // RecvQueuedSize isn't supported for host sockets because we don't allow the - // sentry to call ioctl(2). + // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). return -1 } diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 06392a65a..bc3ce5627 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -198,20 +198,6 @@ func TestListen(t *testing.T) { } } -func TestSend(t *testing.T) { - e := ConnectedEndpoint{writeClosed: true} - if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != syserr.ErrClosedForSend { - t.Errorf("Got %#v.Send() = %v, want = %v", e, err, syserr.ErrClosedForSend) - } -} - -func TestRecv(t *testing.T) { - e := ConnectedEndpoint{readClosed: true} - if _, _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive { - t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, syserr.ErrClosedForReceive) - } -} - func TestPasscred(t *testing.T) { e := ConnectedEndpoint{} if got, want := e.Passcred(), false; got != want { @@ -244,20 +230,6 @@ func TestQueuedSize(t *testing.T) { } } -func TestReadable(t *testing.T) { - e := ConnectedEndpoint{readClosed: true} - if got, want := e.Readable(), true; got != want { - t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want) - } -} - -func TestWritable(t *testing.T) { - e := ConnectedEndpoint{writeClosed: true} - if got, want := e.Writable(), true; got != want { - t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want) - } -} - func TestRelease(t *testing.T) { f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { @@ -272,131 +244,3 @@ func TestRelease(t *testing.T) { t.Errorf("got = %#v, want = %#v", c, want) } } - -func TestClose(t *testing.T) { - type testCase struct { - name string - cep *ConnectedEndpoint - addFD bool - f func() - want *ConnectedEndpoint - } - - var tests []testCase - - // nil is the value used by ConnectedEndpoint to indicate a closed file. - // Non-nil files are used to check if the file gets closed. - - f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} - tests = append(tests, testCase{ - name: "First CloseRecv", - cep: c, - addFD: false, - f: c.CloseRecv, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} - tests = append(tests, testCase{ - name: "Second CloseRecv", - cep: c, - addFD: false, - f: c.CloseRecv, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} - tests = append(tests, testCase{ - name: "First CloseSend", - cep: c, - addFD: false, - f: c.CloseSend, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} - tests = append(tests, testCase{ - name: "Second CloseSend", - cep: c, - addFD: false, - f: c.CloseSend, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} - tests = append(tests, testCase{ - name: "CloseSend then CloseRecv", - cep: c, - addFD: true, - f: c.CloseRecv, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} - tests = append(tests, testCase{ - name: "CloseRecv then CloseSend", - cep: c, - addFD: true, - f: c.CloseSend, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} - tests = append(tests, testCase{ - name: "Full close then CloseRecv", - cep: c, - addFD: false, - f: c.CloseRecv, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, - }) - - f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) - if err != nil { - t.Fatal("Creating socket:", err) - } - c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} - tests = append(tests, testCase{ - name: "Full close then CloseSend", - cep: c, - addFD: false, - f: c.CloseSend, - want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, - }) - - for _, test := range tests { - if test.addFD { - fdnotifier.AddFD(int32(test.cep.file.FD()), nil) - } - if test.f(); !reflect.DeepEqual(test.cep, test.want) { - t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want) - } - } -} diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index aef1a1cb9..0b54c2e77 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -220,9 +220,9 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, } // Bind calls i.InodeOperations.Bind with i as the directory. -func (i *Inode) Bind(ctx context.Context, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func (i *Inode) Bind(ctx context.Context, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { if i.overlay != nil { - return overlayBind(ctx, i.overlay, name, data, perm) + return overlayBind(ctx, i.overlay, parent, name, data, perm) } return i.InodeOperations.Bind(ctx, i, name, data, perm) } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index cdffe173b..06506fb20 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -398,14 +398,14 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena return nil } -func overlayBind(ctx context.Context, o *overlayEntry, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { +func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) { + if err := copyUp(ctx, parent); err != nil { + return nil, err + } + o.copyMu.RLock() defer o.copyMu.RUnlock() - // We do not support doing anything exciting with sockets unless there - // is already a directory in the upper filesystem. - if o.upper == nil { - return nil, syserror.EOPNOTSUPP - } + d, err := o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) if err != nil { return nil, err diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index d19c360e0..1728fe0b5 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -45,6 +45,7 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", + "//pkg/sentry/socket", "//pkg/sentry/socket/rpcinet", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 379569823..986bc0a45 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -21,11 +21,14 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr -// method to return the task as the owner. +// method to return either the task or root as the owner, depending on the +// task's dumpability. // // +stateify savable type taskOwnedInodeOps struct { @@ -41,9 +44,42 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) ( if err != nil { return fs.UnstableAttr{}, err } - // Set the task owner as the file owner. + + // By default, set the task owner as the file owner. creds := i.t.Credentials() uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} + + // Linux doesn't apply dumpability adjustments to world + // readable/executable directories so that applications can stat + // /proc/PID to determine the effective UID of a process. See + // fs/proc/base.c:task_dump_owner. + if fs.IsDir(inode.StableAttr) && uattr.Perms == fs.FilePermsFromMode(0555) { + return uattr, nil + } + + // If the task is not dumpable, then root (in the namespace preferred) + // owns the file. + var m *mm.MemoryManager + i.t.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + + if m == nil { + uattr.Owner.UID = auth.RootKUID + uattr.Owner.GID = auth.RootKGID + } else if m.Dumpability() != mm.UserDumpable { + if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { + uattr.Owner.UID = kuid + } else { + uattr.Owner.UID = auth.RootKUID + } + if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { + uattr.Owner.GID = kgid + } else { + uattr.Owner.GID = auth.RootKGID + } + } + return uattr, nil } diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 4a107c739..034950158 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -27,6 +27,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" ) @@ -213,17 +214,18 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s fmt.Fprintf(&buf, "Num RefCount Protocol Flags Type St Inode Path\n") // Entries - for _, sref := range n.k.ListSockets(linux.AF_UNIX) { - s := sref.Get() + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", sref) + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) continue } sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(*unix.SocketOperations) - if !ok { - panic(fmt.Sprintf("Found non-unix socket file in unix socket table: %+v", sfile)) + if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + // Not a unix socket. + continue } + sops := sfile.FileOperations.(*unix.SocketOperations) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { @@ -240,24 +242,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } } - var sockState int - switch sops.Endpoint().Type() { - case linux.SOCK_DGRAM: - sockState = linux.SS_CONNECTING - // Unlike Linux, we don't have unbound connection-less sockets, - // so no SS_DISCONNECTING. - - case linux.SOCK_SEQPACKET: - fallthrough - case linux.SOCK_STREAM: - // Connectioned. - if sops.Endpoint().(transport.ConnectingEndpoint).Connected() { - sockState = linux.SS_CONNECTED - } else { - sockState = linux.SS_UNCONNECTED - } - } - // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the @@ -282,7 +266,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. - sockState, // State. + sops.State(), // State. sfile.InodeID(), // Inode. ) diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 77e03d349..21a965f90 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -96,7 +96,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks boo contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) } - // TODO(b/31916171): Set EUID/EGID based on dumpability. + // N.B. taskOwnedInodeOps enforces dumpability-based ownership. d := &taskDir{ Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)), t: t, @@ -667,6 +667,21 @@ func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(c, msrc, fs.SpecialFile, t) } +// Check implements fs.InodeOperations.Check. +func (c *comm) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + // This file can always be read or written by members of the same + // thread group. See fs/proc/base.c:proc_tid_comm_permission. + // + // N.B. This check is currently a no-op as we don't yet support writing + // and this file is world-readable anyways. + t := kernel.TaskFromContext(ctx) + if t != nil && t.ThreadGroup() == c.t.ThreadGroup() && !p.Execute { + return true + } + + return fs.ContextCanAccessFile(ctx, inode, p) +} + // GetFile implements fs.InodeOperations.GetFile. func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index bce5f091d..c1721f434 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -54,6 +54,8 @@ type TimerOperations struct { // NewFile returns a timerfd File that receives time from c. func NewFile(ctx context.Context, c ktime.Clock) *fs.File { dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[timerfd]") + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() tops := &TimerOperations{} tops.timer = ktime.NewTimer(c, tops) // Timerfds reject writes, but the Write flag must be set in order to diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index b7c29a4d1..83e1bf247 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -34,6 +34,16 @@ const ( // GID for the root directory. rootGIDKey = "gid" + // cacheKey sets the caching policy for the mount. + cacheKey = "cache" + + // cacheAll uses the virtual file system cache for everything (default). + cacheAll = "cache" + + // cacheRevalidate allows dirents to be cached, but revalidates them on each + // lookup. + cacheRevalidate = "revalidate" + // TODO(edahlgren/mpratt): support a tmpfs size limit. // size = "size" @@ -122,15 +132,24 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou delete(options, rootGIDKey) } + // Construct a mount which will follow the cache options provided. + var msrc *fs.MountSource + switch options[cacheKey] { + case "", cacheAll: + msrc = fs.NewCachingMountSource(f, flags) + case cacheRevalidate: + msrc = fs.NewRevalidatingMountSource(f, flags) + default: + return nil, fmt.Errorf("invalid cache policy option %q", options[cacheKey]) + } + delete(options, cacheKey) + // Fail if the caller passed us more options than we can parse. They may be // expecting us to set something we can't set. if len(options) > 0 { return nil, fmt.Errorf("unsupported mount options: %v", options) } - // Construct a mount which will cache dirents. - msrc := fs.NewCachingMountSource(f, flags) - // Construct the tmpfs root. return NewDir(ctx, nil, owner, perms, msrc), nil } diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD new file mode 100644 index 000000000..1a4632a54 --- /dev/null +++ b/pkg/sentry/hostmm/BUILD @@ -0,0 +1,18 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "hostmm", + srcs = [ + "cgroup.go", + "hostmm.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/fd", + "//pkg/log", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go new file mode 100644 index 000000000..e5cc26ab2 --- /dev/null +++ b/pkg/sentry/hostmm/cgroup.go @@ -0,0 +1,111 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostmm + +import ( + "bufio" + "fmt" + "os" + "path" + "strings" +) + +// currentCgroupDirectory returns the directory for the cgroup for the given +// controller in which the calling process resides. +func currentCgroupDirectory(ctrl string) (string, error) { + root, err := cgroupRootDirectory(ctrl) + if err != nil { + return "", err + } + cg, err := currentCgroup(ctrl) + if err != nil { + return "", err + } + return path.Join(root, cg), nil +} + +// cgroupRootDirectory returns the root directory for the cgroup hierarchy in +// which the given cgroup controller is mounted in the calling process' mount +// namespace. +func cgroupRootDirectory(ctrl string) (string, error) { + const path = "/proc/self/mounts" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> fstab(5): + // Each line of /proc/self/mounts describes a mount. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 6 space-separated fields. Find the line for + // which the third field (fs_vfstype) is cgroup, and the fourth field + // (fs_mntops, a comma-separated list of mount options) contains + // ctrl. + var spec, file, vfstype, mntopts, freq, passno string + const nrfields = 6 + line := scanner.Text() + n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno) + if err != nil { + return "", fmt.Errorf("failed to parse %s: %v", path, err) + } + if n != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields) + } + if vfstype != "cgroup" { + continue + } + for _, mntopt := range strings.Split(mntopts, ",") { + if mntopt == ctrl { + return file, nil + } + } + } + return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl) +} + +// currentCgroup returns the cgroup for the given controller in which the +// calling process resides. The returned string is a path that should be +// interpreted as relative to cgroupRootDirectory(ctrl). +func currentCgroup(ctrl string) (string, error) { + const path = "/proc/self/cgroup" + file, err := os.Open(path) + if err != nil { + return "", err + } + defer file.Close() + + // Per proc(5) -> cgroups(7): + // Each line of /proc/self/cgroups describes a cgroup hierarchy. + scanner := bufio.NewScanner(file) + for scanner.Scan() { + // Each line consists of 3 colon-separated fields. Find the line for + // which the second field (controller-list, a comma-separated list of + // cgroup controllers) contains ctrl. + line := scanner.Text() + const nrfields = 3 + fields := strings.Split(line, ":") + if len(fields) != nrfields { + return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields) + } + for _, controller := range strings.Split(fields[1], ",") { + if controller == ctrl { + return fields[2], nil + } + } + } + return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl) +} diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go new file mode 100644 index 000000000..5432cada9 --- /dev/null +++ b/pkg/sentry/hostmm/hostmm.go @@ -0,0 +1,130 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostmm provides tools for interacting with the host Linux kernel's +// virtual memory management subsystem. +package hostmm + +import ( + "fmt" + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NotifyCurrentMemcgPressureCallback requests that f is called whenever the +// calling process' memory cgroup indicates memory pressure of the given level, +// as specified by Linux's Documentation/cgroup-v1/memory.txt. +// +// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that +// terminates the requested memory pressure notifications. This function may be +// called at most once. +func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) { + cgdir, err := currentCgroupDirectory("memory") + if err != nil { + return nil, err + } + + pressurePath := path.Join(cgdir, "memory.pressure_level") + pressureFile, err := os.Open(pressurePath) + if err != nil { + return nil, err + } + defer pressureFile.Close() + + eventControlPath := path.Join(cgdir, "cgroup.event_control") + eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0) + if err != nil { + return nil, err + } + defer eventControlFile.Close() + + eventFD, err := newEventFD() + if err != nil { + return nil, err + } + + // Don't use fmt.Fprintf since the whole string needs to be written in a + // single syscall. + eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level) + if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil { + eventFD.Close() + return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr)) + } + + log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level) + const sizeofUint64 = 8 + // The most significant bit of the eventfd value is set by the stop + // function, which is practically unambiguous since it's not plausible for + // 2**63 pressure events to occur between eventfd reads. + const stopVal = 1 << 63 + stopCh := make(chan struct{}) + go func() { // S/R-SAFE: f provides synchronization if necessary + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + for { + n, err := rw.Read(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + val := usermem.ByteOrder.Uint64(buf[:]) + if val >= stopVal { + // Assume this was due to the notifier's "destructor" (the + // function returned by NotifyCurrentMemcgPressureCallback + // below) being called. + eventFD.Close() + close(stopCh) + return + } + f() + } + }() + return func() { + rw := fd.NewReadWriter(eventFD.FD()) + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], stopVal) + for { + n, err := rw.Write(buf[:]) + if err != nil { + if err == syscall.EINTR { + continue + } + panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err)) + } + if n != sizeofUint64 { + panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64)) + } + break + } + <-stopCh + }, nil +} + +func newEventFD() (*fd.FD, error) { + f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0) + if e != 0 { + return nil, fmt.Errorf("failed to create eventfd: %v", e) + } + return fd.New(int(f)), nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 99a2fd964..04e375910 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -64,6 +64,18 @@ go_template_instance( }, ) +go_template_instance( + name = "socket_list", + out = "socket_list.go", + package = "kernel", + prefix = "socket", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*SocketEntry", + "Linker": "*SocketEntry", + }, +) + proto_library( name = "uncaught_signal_proto", srcs = ["uncaught_signal.proto"], @@ -104,6 +116,7 @@ go_library( "sessions.go", "signal.go", "signal_handlers.go", + "socket_list.go", "syscalls.go", "syscalls_state.go", "syslog.go", diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index bbacba1f4..43ae22a5d 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -156,6 +156,8 @@ var cycleMu sync.Mutex func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 2f900be38..fe474cbf0 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -69,6 +69,8 @@ type EventOperations struct { func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]") + // Release the initial dirent reference after NewFile takes a reference. + defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 85d73ace2..f253a81d9 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -182,9 +182,13 @@ type Kernel struct { // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` - // socketTable is used to track all sockets on the system. Protected by + // sockets is the list of all network sockets the system. Protected by // extMu. - socketTable map[int]map[*refs.WeakRef]struct{} + sockets socketList + + // nextSocketEntry is the next entry number to use in sockets. Protected + // by extMu. + nextSocketEntry uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -283,7 +287,6 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() - k.socketTable = make(map[int]map[*refs.WeakRef]struct{}) return nil } @@ -1137,51 +1140,43 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { }) } -// socketEntry represents a socket recorded in Kernel.socketTable. It implements +// SocketEntry represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable -type socketEntry struct { - k *Kernel - sock *refs.WeakRef - family int +type SocketEntry struct { + socketEntry + k *Kernel + Sock *refs.WeakRef + ID uint64 // Socket table entry number. } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *socketEntry) WeakRefGone() { +func (s *SocketEntry) WeakRefGone() { s.k.extMu.Lock() - // k.socketTable is guaranteed to point to a valid socket table for s.family - // at this point, since we made sure of the fact when we created this - // socketEntry, and we never delete socket tables. - delete(s.k.socketTable[s.family], s.sock) + s.k.sockets.Remove(s) s.k.extMu.Unlock() } // RecordSocket adds a socket to the system-wide socket table for tracking. // // Precondition: Caller must hold a reference to sock. -func (k *Kernel) RecordSocket(sock *fs.File, family int) { +func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() - table, ok := k.socketTable[family] - if !ok { - table = make(map[*refs.WeakRef]struct{}) - k.socketTable[family] = table - } - se := socketEntry{k: k, family: family} - se.sock = refs.NewWeakRef(sock, &se) - table[se.sock] = struct{}{} + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{k: k, ID: id} + s.Sock = refs.NewWeakRef(sock, s) + k.sockets.PushBack(s) k.extMu.Unlock() } -// ListSockets returns a snapshot of all sockets of a given family. -func (k *Kernel) ListSockets(family int) []*refs.WeakRef { +// ListSockets returns a snapshot of all sockets. +func (k *Kernel) ListSockets() []*SocketEntry { k.extMu.Lock() - socks := []*refs.WeakRef{} - if table, ok := k.socketTable[family]; ok { - socks = make([]*refs.WeakRef, 0, len(table)) - for s := range table { - socks = append(socks, s) - } + var socks []*SocketEntry + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, s) } k.extMu.Unlock() return socks diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 926c4c623..dc7da529e 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -38,7 +38,11 @@ type inodeOperations struct { fsutil.InodeNotMappable `state:"nosave"` fsutil.InodeNotSocket `state:"nosave"` fsutil.InodeNotSymlink `state:"nosave"` - fsutil.InodeNotVirtual `state:"nosave"` + + // Marking pipe inodes as virtual allows them to be saved and restored + // even if they have been unlinked. We can get away with this because + // their state exists entirely within the sentry. + fsutil.InodeVirtual `state:"nosave"` fsutil.InodeSimpleAttributes @@ -86,7 +90,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi switch { case flags.Read && !flags.Write: // O_RDONLY. - r := i.p.Open(ctx, flags) + r := i.p.Open(ctx, d, flags) i.newHandleLocked(&i.rWakeup) if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { @@ -102,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi return r, nil case flags.Write && !flags.Read: // O_WRONLY. - w := i.p.Open(ctx, flags) + w := i.p.Open(ctx, d, flags) i.newHandleLocked(&i.wWakeup) if i.p.isNamed && !i.p.HasReaders() { @@ -122,7 +126,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi case flags.Read && flags.Write: // O_RDWR. // Pipes opened for read-write always succeeds without blocking. - rw := i.p.Open(ctx, flags) + rw := i.p.Open(ctx, d, flags) i.newHandleLocked(&i.rWakeup) i.newHandleLocked(&i.wWakeup) return rw, nil diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index 31d9b0443..9a946b380 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -62,7 +62,9 @@ var perms fs.FilePermissions = fs.FilePermissions{ } func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) { - file, err := n.GetFile(ctx, nil, flags) + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(inode, "pipe") + file, err := n.GetFile(ctx, d, flags) if err != nil { t.Fatalf("open with flags %+v failed: %v", flags, err) } @@ -73,7 +75,9 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag } func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) { - file, err := n.GetFile(ctx, nil, flags) + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + d := fs.NewDirent(inode, "pipe") + file, err := n.GetFile(ctx, d, flags) if resChan != nil { resChan <- openResult{file, err} } diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index b65204492..73438dc62 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -71,11 +71,6 @@ type Pipe struct { // This value is immutable. atomicIOBytes int64 - // The dirent backing this pipe. Shared by all readers and writers. - // - // This value is immutable. - Dirent *fs.Dirent - // The number of active readers for this pipe. // // Access atomically. @@ -130,14 +125,20 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) if atomicIOBytes > sizeBytes { atomicIOBytes = sizeBytes } - p := &Pipe{ + return &Pipe{ isNamed: isNamed, max: sizeBytes, atomicIOBytes: atomicIOBytes, } +} - // Build the fs.Dirent of this pipe, shared by all fs.Files associated - // with this pipe. +// NewConnectedPipe initializes a pipe and returns a pair of objects +// representing the read and write ends of the pipe. +func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { + p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) + + // Build an fs.Dirent for the pipe which will be shared by both + // returned files. perms := fs.FilePermissions{ User: fs.PermMask{Read: true, Write: true}, } @@ -150,36 +151,32 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64) BlockSize: int64(atomicIOBytes), } ms := fs.NewPseudoMountSource() - p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) - return p -} - -// NewConnectedPipe initializes a pipe and returns a pair of objects -// representing the read and write ends of the pipe. -func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) { - p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes) - return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true}) + d := fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) + // The p.Open calls below will each take a reference on the Dirent. We + // must drop the one we already have. + defer d.DecRef() + return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } // Open opens the pipe and returns a new file. // // Precondition: at least one of flags.Read or flags.Write must be set. -func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File { +func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File { switch { case flags.Read && flags.Write: p.rOpen() p.wOpen() - return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{ + return fs.NewFile(ctx, d, flags, &ReaderWriter{ Pipe: p, }) case flags.Read: p.rOpen() - return fs.NewFile(ctx, p.Dirent, flags, &Reader{ + return fs.NewFile(ctx, d, flags, &Reader{ ReaderWriter: ReaderWriter{Pipe: p}, }) case flags.Write: p.wOpen() - return fs.NewFile(ctx, p.Dirent, flags, &Writer{ + return fs.NewFile(ctx, d, flags, &Writer{ ReaderWriter: ReaderWriter{Pipe: p}, }) default: diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 4423e7efd..193447b17 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -19,6 +19,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -92,6 +93,14 @@ const ( // ptrace(2), subsection "Ptrace access mode checking". If attach is true, it // checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access // mode PTRACE_MODE_READ. +// +// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a +// racing setuid(2) may change traceability). This may pose a risk when a task +// changes from traceable to not traceable. This is only problematic across +// execve, where privileges may increase. +// +// We currently do not implement privileged executables (set-user/group-ID bits +// and file capabilities), so that case is not reachable. func (t *Task) CanTrace(target *Task, attach bool) bool { // "1. If the calling thread and the target thread are in the same thread // group, access is always allowed." - ptrace(2) @@ -162,7 +171,13 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { return false } - // TODO(b/31916171): dumpability check + var targetMM *mm.MemoryManager + target.WithMuLocked(func(t *Task) { + targetMM = t.MemoryManager() + }) + if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable { + return false + } if callerCreds.UserNamespace != targetCreds.UserNamespace { return false } diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 0572053db..27cd3728b 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -32,6 +32,51 @@ import ( // syscall. const maxSyscallNum = 2000 +// SyscallSupportLevel is a syscall support levels. +type SyscallSupportLevel int + +// String returns a human readable represetation of the support level. +func (l SyscallSupportLevel) String() string { + switch l { + case SupportUnimplemented: + return "Unimplemented" + case SupportPartial: + return "Partial Support" + case SupportFull: + return "Full Support" + default: + return "Undocumented" + } +} + +const ( + // SupportUndocumented indicates the syscall is not documented yet. + SupportUndocumented = iota + + // SupportUnimplemented indicates the syscall is unimplemented. + SupportUnimplemented + + // SupportPartial indicates the syscall is partially supported. + SupportPartial + + // SupportFull indicates the syscall is fully supported. + SupportFull +) + +// Syscall includes the syscall implementation and compatibility information. +type Syscall struct { + // Name is the syscall name. + Name string + // Fn is the implementation of the syscall. + Fn SyscallFn + // SupportLevel is the level of support implemented in gVisor. + SupportLevel SyscallSupportLevel + // Note describes the compatibility of the syscall. + Note string + // URLs is set of URLs to any relevant bugs or issues. + URLs []string +} + // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) @@ -83,7 +128,7 @@ type SyscallFlagsTable struct { // Init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. -func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { +func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { e.enable = make([]uint32, max+1) for num := range table { e.enable[num] = syscallPresent @@ -194,7 +239,7 @@ type SyscallTable struct { AuditNumber uint32 `state:"manual"` // Table is the collection of functions. - Table map[uintptr]SyscallFn `state:"manual"` + Table map[uintptr]Syscall `state:"manual"` // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. @@ -247,7 +292,7 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { func RegisterSyscallTable(s *SyscallTable) { if s.Table == nil { // Ensure non-nil lookup table. - s.Table = make(map[uintptr]SyscallFn) + s.Table = make(map[uintptr]Syscall) } if s.Emulate == nil { // Ensure non-nil emulate table. @@ -268,8 +313,8 @@ func RegisterSyscallTable(s *SyscallTable) { s.lookup = make([]SyscallFn, max+1) // Initialize the fast-lookup table. - for num, fn := range s.Table { - s.lookup[num] = fn + for num, sc := range s.Table { + s.lookup[num] = sc.Fn } s.FeatureEnable.init(s.Table, max) @@ -303,5 +348,8 @@ func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast look array. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { - return s.Table[sysno] + if sc, ok := s.Table[sysno]; ok { + return sc.Fn + } + return nil } diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go index 8f7cdb9f3..3f2b042c8 100644 --- a/pkg/sentry/kernel/table_test.go +++ b/pkg/sentry/kernel/table_test.go @@ -26,11 +26,13 @@ const ( ) func createSyscallTable() *SyscallTable { - m := make(map[uintptr]SyscallFn) + m := make(map[uintptr]Syscall) for i := uintptr(0); i <= maxTestSyscall; i++ { j := i - m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { - return j, nil, nil + m[i] = Syscall{ + Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) { + return j, nil, nil + }, } } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f9378c2de..4d889422f 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -455,12 +455,13 @@ type Task struct { // single numa node, all policies are no-ops. We only track this information // so that we can return reasonable values if the application calls // get_mempolicy(2) after setting a non-default policy. Note that in the - // real syscall, nodemask can be longer than 4 bytes, but we always report a - // single node so never need to save more than a single bit. + // real syscall, nodemask can be longer than a single unsigned long, but we + // always report a single node so never need to save more than a single + // bit. // // numaPolicy and numaNodeMask are protected by mu. numaPolicy int32 - numaNodeMask uint32 + numaNodeMask uint64 // If netns is true, the task is in a non-root network namespace. Network // namespaces aren't currently implemented in full; being in a network diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 5d1425d5c..35d5cb90c 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -68,6 +68,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -198,6 +199,12 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { return flags.CloseOnExec }) + // NOTE(b/30815691): We currently do not implement privileged + // executables (set-user/group-ID bits and file capabilities). This + // allows us to unconditionally enable user dumpability on the new mm. + // See fs/exec.c:setup_new_exec. + r.tc.MemoryManager.SetDumpability(mm.UserDumpable) + // Switch to the new process. t.MemoryManager().Deactivate() t.mu.Lock() diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go index 17f08729a..ec95f78d0 100644 --- a/pkg/sentry/kernel/task_identity.go +++ b/pkg/sentry/kernel/task_identity.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -206,8 +207,17 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { // (filesystem UIDs aren't implemented, nor are any of the capabilities in // question) - // Not documented, but compare Linux's kernel/cred.c:commit_creds(). if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } } @@ -303,8 +313,18 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { t.creds = t.creds.Fork() // See doc for creds. t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS - // Not documented, but compare Linux's kernel/cred.c:commit_creds(). if oldE != newE { + // "[dumpability] is reset to the current value contained in + // the file /proc/sys/fs/suid_dumpable (which by default has + // the value 0), in the following circumstances: The process's + // effective user or group ID is changed." - prctl(2) + // + // (suid_dumpable isn't implemented, so we just use the + // default. + t.MemoryManager().SetDumpability(mm.NotDumpable) + + // Not documented, but compare Linux's + // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } } diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 5455f6ea9..1c94ab11b 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -622,14 +622,14 @@ func (t *Task) SetNiceness(n int) { } // NumaPolicy returns t's current numa policy. -func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) { +func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. -func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) { +func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD deleted file mode 100644 index 68b03d4cc..000000000 --- a/pkg/sentry/memutil/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "memutil", - srcs = [ - "memutil.go", - "memutil_unsafe.go", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memutil", - visibility = ["//pkg/sentry:internal"], - deps = ["@org_golang_x_sys//unix:go_default_library"], -) diff --git a/pkg/sentry/memutil/memutil.go b/pkg/sentry/memutil/memutil.go deleted file mode 100644 index a4154c42a..000000000 --- a/pkg/sentry/memutil/memutil.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package memutil contains the utility functions for memory operations. -package memutil diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/sentry/memutil/memutil_unsafe.go deleted file mode 100644 index 92eab8a26..000000000 --- a/pkg/sentry/memutil/memutil_unsafe.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memutil - -import ( - "fmt" - "syscall" - "unsafe" - - "golang.org/x/sys/unix" -) - -// CreateMemFD creates a memfd file and returns the fd. -func CreateMemFD(name string, flags int) (int, error) { - p, err := syscall.BytePtrFromString(name) - if err != nil { - return -1, err - } - fd, _, e := syscall.Syscall(unix.SYS_MEMFD_CREATE, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) - if e != 0 { - if e == syscall.ENOSYS { - return -1, fmt.Errorf("memfd_create(2) is not implemented. Check that you have Linux 3.17 or higher") - } - return -1, e - } - return int(fd), nil -} diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 7a65a62a2..7646d5ab2 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -37,6 +37,7 @@ func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *Memo privateRefs: &privateRefs{}, users: 1, auxv: arch.Auxv{}, + dumpability: UserDumpable, aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, } } @@ -79,8 +80,9 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { envv: mm.envv, auxv: append(arch.Auxv(nil), mm.auxv...), // IncRef'd below, once we know that there isn't an error. - executable: mm.executable, - aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + executable: mm.executable, + dumpability: mm.dumpability, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, } // Copy vmas. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 9768e51f1..c218006ee 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -20,6 +20,36 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) +// Dumpability describes if and how core dumps should be created. +type Dumpability int + +const ( + // NotDumpable indicates that core dumps should never be created. + NotDumpable Dumpability = iota + + // UserDumpable indicates that core dumps should be created, owned by + // the current user. + UserDumpable + + // RootDumpable indicates that core dumps should be created, owned by + // root. + RootDumpable +) + +// Dumpability returns the dumpability. +func (mm *MemoryManager) Dumpability() Dumpability { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.dumpability +} + +// SetDumpability sets the dumpability. +func (mm *MemoryManager) SetDumpability(d Dumpability) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.dumpability = d +} + // ArgvStart returns the start of the application argument vector. // // There is no guarantee that this value is sensible w.r.t. ArgvEnd. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index eb6defa2b..604866d04 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -219,6 +219,12 @@ type MemoryManager struct { // executable is protected by metadataMu. executable *fs.Dirent + // dumpability describes if and how this MemoryManager may be dumped to + // userspace. + // + // dumpability is protected by metadataMu. + dumpability Dumpability + // aioManager keeps track of AIOContexts used for async IOs. AIOManager // must be cloned when CLONE_VM is used. aioManager aioManager @@ -270,6 +276,12 @@ type vma struct { mlockMode memmap.MLockMode + // numaPolicy is the NUMA policy for this vma set by mbind(). + numaPolicy int32 + + // numaNodemask is the NUMA nodemask for this vma set by mbind(). + numaNodemask uint64 + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 0368c6794..9cf136532 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -470,6 +470,16 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi return 0, syserror.EINVAL } + // Check that the new region is valid. + _, err := mm.findAvailableLocked(newSize, findAvailableOpts{ + Addr: newAddr, + Fixed: true, + Unmap: true, + }) + if err != nil { + return 0, err + } + // Unmap any mappings at the destination. mm.unmapLocked(ctx, newAR) @@ -963,6 +973,59 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error return nil } +// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR). +func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg := mm.vmas.FindSegment(addr) + if !vseg.Ok() { + return 0, 0, syserror.EFAULT + } + vma := vseg.ValuePtr() + return vma.numaPolicy, vma.numaNodemask, nil +} + +// SetNumaPolicy implements the semantics of Linux's mbind(). +func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error { + if !addr.IsPageAligned() { + return syserror.EINVAL + } + // Linux allows this to overflow. + la, _ := usermem.Addr(length).RoundUp() + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + if ar.Length() == 0 { + return nil + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + }() + vseg := mm.vmas.LowerBoundSegment(ar.Start) + lastEnd := ar.Start + for { + if !vseg.Ok() || lastEnd < vseg.Start() { + // "EFAULT: ... there was an unmapped hole in the specified memory + // range specified [sic] by addr and len." - mbind(2) + return syserror.EFAULT + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + vma.numaPolicy = policy + vma.numaNodemask = nodemask + lastEnd = vseg.End() + if ar.End <= lastEnd { + return nil + } + vseg, _ = vseg.NextNonEmpty() + } +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 02203f79f..0af8de5b0 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -107,6 +107,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp private: opts.Private, growsDown: opts.GrowsDown, mlockMode: opts.MLockMode, + numaPolicy: linux.MPOL_DEFAULT, id: opts.MappingIdentity, hint: opts.Hint, } @@ -436,6 +437,8 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || vma1.mlockMode != vma2.mlockMode || + vma1.numaPolicy != vma2.numaPolicy || + vma1.numaNodemask != vma2.numaNodemask || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 8a8a0e4e4..ca2d5ba6f 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -63,9 +63,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", + "//pkg/memutil", "//pkg/sentry/arch", "//pkg/sentry/context", - "//pkg/sentry/memutil", + "//pkg/sentry/hostmm", "//pkg/sentry/platform", "//pkg/sentry/safemem", "//pkg/sentry/usage", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 2b9924ad7..6d91f1a7b 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -32,6 +32,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -162,6 +163,11 @@ type MemoryFile struct { // evictionWG counts the number of goroutines currently performing evictions. evictionWG sync.WaitGroup + + // stopNotifyPressure stops memory cgroup pressure level + // notifications used to drive eviction. stopNotifyPressure is + // immutable. + stopNotifyPressure func() } // MemoryFileOpts provides options to NewMemoryFile. @@ -169,6 +175,11 @@ type MemoryFileOpts struct { // DelayedEviction controls the extent to which the MemoryFile may delay // eviction of evictable allocations. DelayedEviction DelayedEvictionType + + // If UseHostMemcgPressure is true, use host memory cgroup pressure level + // notifications to determine when eviction is necessary. This option has + // no effect unless DelayedEviction is DelayedEvictionEnabled. + UseHostMemcgPressure bool } // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. @@ -186,9 +197,14 @@ const ( // evictable allocations until doing so is considered necessary to avoid // performance degradation due to host memory pressure, or OOM kills. // - // As of this writing, DelayedEvictionEnabled delays evictions until the - // reclaimer goroutine is out of work (pages to reclaim), then evicts all - // pending evictable allocations immediately. + // As of this writing, the behavior of DelayedEvictionEnabled depends on + // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled: + // + // - If UseHostMemcgPressure is true, evictions are delayed until memory + // pressure is indicated. + // + // - Otherwise, evictions are only delayed until the reclaimer goroutine + // is out of work (pages to reclaim). DelayedEvictionEnabled // DelayedEvictionManual requires that evictable allocations are only @@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { } f.mappings.Store(make([]uintptr, initialSize/chunkSize)) f.reclaimCond.L = &f.mu + + if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { + stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() { + f.mu.Lock() + startedAny := f.startEvictionsLocked() + f.mu.Unlock() + if startedAny { + log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure") + } + }, "low") + if err != nil { + return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err) + } + f.stopNotifyPressure = stop + } + go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) // Kick off eviction immediately. f.startEvictionGoroutineLocked(user, info) case DelayedEvictionEnabled: - // Ensure that the reclaimer goroutine is running, so that it can - // start eviction when necessary. - f.reclaimCond.Signal() + if !f.opts.UseHostMemcgPressure { + // Ensure that the reclaimer goroutine is running, so that it + // can start eviction when necessary. + f.reclaimCond.Signal() + } } } } @@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() { } f.markReclaimed(fr) } + // We only get here if findReclaimable finds f.destroyed set and returns // false. f.mu.Lock() - defer f.mu.Unlock() if !f.destroyed { + f.mu.Unlock() panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") } f.file.Close() @@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() { } // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) f.mappings.Store([]uintptr{}) + f.mu.Unlock() + + // This must be called without holding f.mu to avoid circular lock + // ordering. + if f.stopNotifyPressure != nil { + f.stopNotifyPressure() + } } func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { @@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { if f.reclaimable { break } - if f.opts.DelayedEviction == DelayedEvictionEnabled { + if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { // No work to do. Evict any pending evictable allocations to // get more reclaimable pages before going to sleep. f.startEvictionsLocked() @@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() { } // Preconditions: f.mu must be locked. -func (f *MemoryFile) startEvictionsLocked() { +func (f *MemoryFile) startEvictionsLocked() bool { + startedAny := false for user, info := range f.evictable { // Don't start multiple goroutines to evict the same user's // allocations. if !info.evicting { f.startEvictionGoroutineLocked(user, info) + startedAny = true } } + return startedAny } // Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 9999e58f4..2931d6ddc 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -32,10 +32,10 @@ go_library( "//pkg/atomicbitops", "//pkg/cpuid", "//pkg/log", + "//pkg/procid", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/procid", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", "//pkg/sentry/platform/safecopy", diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index f5953b96e..f8ccd86af 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -23,7 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/atomicbitops" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/procid" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/sentry/platform/procid/BUILD deleted file mode 100644 index 277509624..000000000 --- a/pkg/sentry/platform/procid/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "procid", - srcs = [ - "procid.go", - "procid_amd64.s", - "procid_arm64.s", - ], - importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid", - visibility = ["//pkg/sentry:internal"], -) - -go_test( - name = "procid_test", - size = "small", - srcs = [ - "procid_test.go", - ], - embed = [":procid"], -) - -go_test( - name = "procid_net_test", - size = "small", - srcs = [ - "procid_net_test.go", - "procid_test.go", - ], - embed = [":procid"], -) diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go deleted file mode 100644 index 78b92422c..000000000 --- a/pkg/sentry/platform/procid/procid.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package procid provides a way to get the current system thread identifier. -package procid - -// Current returns the current system thread identifier. -// -// Precondition: This should only be called with the runtime OS thread locked. -func Current() uint64 diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s deleted file mode 100644 index 30ec8e6e2..000000000 --- a/pkg/sentry/platform/procid/procid_amd64.s +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 -// +build go1.8 -// +build !go1.14 - -#include "textflag.h" - -TEXT ·Current(SB),NOSPLIT,$0-8 - // The offset specified here is the m_procid offset for Go1.8+. - // Changes to this offset should be caught by the tests, and major - // version changes require an explicit tag change above. - MOVQ TLS, AX - MOVQ 0(AX)(TLS*1), AX - MOVQ 48(AX), AX // g_m (may change in future versions) - MOVQ 72(AX), AX // m_procid (may change in future versions) - MOVQ AX, ret+0(FP) - RET diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s deleted file mode 100644 index e340d9f98..000000000 --- a/pkg/sentry/platform/procid/procid_arm64.s +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 -// +build go1.8 -// +build !go1.14 - -#include "textflag.h" - -TEXT ·Current(SB),NOSPLIT,$0-8 - // The offset specified here is the m_procid offset for Go1.8+. - // Changes to this offset should be caught by the tests, and major - // version changes require an explicit tag change above. - MOVD g, R0 // g - MOVD 48(R0), R0 // g_m (may change in future versions) - MOVD 72(R0), R0 // m_procid (may change in future versions) - MOVD R0, ret+0(FP) - RET diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/sentry/platform/procid/procid_net_test.go deleted file mode 100644 index b628e2285..000000000 --- a/pkg/sentry/platform/procid/procid_net_test.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package procid - -// This file is just to force the inclusion of the "net" package, which will -// make the test binary a cgo one. -import ( - _ "net" -) diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/sentry/platform/procid/procid_test.go deleted file mode 100644 index 88dd0b3ae..000000000 --- a/pkg/sentry/platform/procid/procid_test.go +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package procid - -import ( - "os" - "runtime" - "sync" - "syscall" - "testing" -) - -// runOnMain is used to send functions to run on the main (initial) thread. -var runOnMain = make(chan func(), 10) - -func checkProcid(t *testing.T, start *sync.WaitGroup, done *sync.WaitGroup) { - defer done.Done() - - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - start.Done() - start.Wait() - - procID := Current() - tid := syscall.Gettid() - - if procID != uint64(tid) { - t.Logf("Bad procid: expected %v, got %v", tid, procID) - t.Fail() - } -} - -func TestProcidInitialized(t *testing.T) { - var start sync.WaitGroup - var done sync.WaitGroup - - count := 100 - start.Add(count + 1) - done.Add(count + 1) - - // Run the check on the main thread. - // - // When cgo is not included, the only case when procid isn't initialized - // is in the main (initial) thread, so we have to test this case - // specifically. - runOnMain <- func() { - checkProcid(t, &start, &done) - } - - // Run the check on a number of different threads. - for i := 0; i < count; i++ { - go checkProcid(t, &start, &done) - } - - done.Wait() -} - -func TestMain(m *testing.M) { - // Make sure we remain at the main (initial) thread. - runtime.LockOSThread() - - // Start running tests in a different goroutine. - go func() { - os.Exit(m.Run()) - }() - - // Execute any functions that have been sent for execution in the main - // thread. - for f := range runOnMain { - f() - } -} diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index e9e4a0d16..434d003a3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -20,11 +20,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/log", + "//pkg/procid", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/procid", "//pkg/sentry/platform/safecopy", "//pkg/sentry/usermem", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 83b43057f..d7800a55e 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -21,9 +21,10 @@ import ( "sync" "syscall" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/procid" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -300,6 +301,18 @@ const ( killed ) +func (t *thread) dumpAndPanic(message string) { + var regs syscall.PtraceRegs + message += "\n" + if err := t.getRegs(®s); err == nil { + message += dumpRegs(®s) + } else { + log.Warningf("unable to get registers: %v", err) + } + message += fmt.Sprintf("stubStart\t = %016x\n", stubStart) + panic(message) +} + // wait waits for a stop event. // // Precondition: outcome is a valid waitOutcome. @@ -320,7 +333,7 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal { switch outcome { case stopped: if !status.Stopped() { - panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) + t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status)) } stopSig := status.StopSignal() if stopSig == 0 { @@ -334,12 +347,12 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal { return stopSig case killed: if !status.Exited() && !status.Signaled() { - panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) + t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status)) } return syscall.Signal(status.ExitStatus()) default: // Should not happen. - panic(fmt.Sprintf("unknown outcome: %v", outcome)) + t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome)) } } } diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 77a0e908f..fdd21c8f8 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -17,6 +17,8 @@ package ptrace import ( + "fmt" + "strings" "syscall" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" @@ -102,3 +104,38 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) { } return uintptr(rval), nil } + +func dumpRegs(regs *syscall.PtraceRegs) string { + var m strings.Builder + + fmt.Fprintf(&m, "Registers:\n") + fmt.Fprintf(&m, "\tR15\t = %016x\n", regs.R15) + fmt.Fprintf(&m, "\tR14\t = %016x\n", regs.R14) + fmt.Fprintf(&m, "\tR13\t = %016x\n", regs.R13) + fmt.Fprintf(&m, "\tR12\t = %016x\n", regs.R12) + fmt.Fprintf(&m, "\tRbp\t = %016x\n", regs.Rbp) + fmt.Fprintf(&m, "\tRbx\t = %016x\n", regs.Rbx) + fmt.Fprintf(&m, "\tR11\t = %016x\n", regs.R11) + fmt.Fprintf(&m, "\tR10\t = %016x\n", regs.R10) + fmt.Fprintf(&m, "\tR9\t = %016x\n", regs.R9) + fmt.Fprintf(&m, "\tR8\t = %016x\n", regs.R8) + fmt.Fprintf(&m, "\tRax\t = %016x\n", regs.Rax) + fmt.Fprintf(&m, "\tRcx\t = %016x\n", regs.Rcx) + fmt.Fprintf(&m, "\tRdx\t = %016x\n", regs.Rdx) + fmt.Fprintf(&m, "\tRsi\t = %016x\n", regs.Rsi) + fmt.Fprintf(&m, "\tRdi\t = %016x\n", regs.Rdi) + fmt.Fprintf(&m, "\tOrig_rax = %016x\n", regs.Orig_rax) + fmt.Fprintf(&m, "\tRip\t = %016x\n", regs.Rip) + fmt.Fprintf(&m, "\tCs\t = %016x\n", regs.Cs) + fmt.Fprintf(&m, "\tEflags\t = %016x\n", regs.Eflags) + fmt.Fprintf(&m, "\tRsp\t = %016x\n", regs.Rsp) + fmt.Fprintf(&m, "\tSs\t = %016x\n", regs.Ss) + fmt.Fprintf(&m, "\tFs_base\t = %016x\n", regs.Fs_base) + fmt.Fprintf(&m, "\tGs_base\t = %016x\n", regs.Gs_base) + fmt.Fprintf(&m, "\tDs\t = %016x\n", regs.Ds) + fmt.Fprintf(&m, "\tEs\t = %016x\n", regs.Es) + fmt.Fprintf(&m, "\tFs\t = %016x\n", regs.Fs) + fmt.Fprintf(&m, "\tGs\t = %016x\n", regs.Gs) + + return m.String() +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 2c07b4ac3..914be7486 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -22,9 +22,9 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/procid" "gvisor.googlesource.com/gvisor/pkg/seccomp" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" - "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" ) const syscallEvent syscall.Signal = 0x80 @@ -142,7 +142,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // down available calls only to what is needed. rules := []seccomp.RuleSet{ // Rules for trapping vsyscall access. - seccomp.RuleSet{ + { Rules: seccomp.SyscallRules{ syscall.SYS_GETTIMEOFDAY: {}, syscall.SYS_TIME: {}, diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index c0238691d..434d7ca2e 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -406,12 +406,20 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { return nil } if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { - tcred := t.Credentials() - return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} + return MakeCreds(t) } return nil } +// MakeCreds creates default SCMCredentials. +func MakeCreds(t *kernel.Task) SCMCredentials { + if t == nil { + return nil + } + tcred := t.Credentials() + return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} +} + // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD index 44bb97b5b..7e2679ea0 100644 --- a/pkg/sentry/socket/epsocket/BUILD +++ b/pkg/sentry/socket/epsocket/BUILD @@ -32,7 +32,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go index de4b963da..a50798cb3 100644 --- a/pkg/sentry/socket/epsocket/epsocket.go +++ b/pkg/sentry/socket/epsocket/epsocket.go @@ -44,7 +44,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" @@ -52,6 +51,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp" "gvisor.googlesource.com/gvisor/pkg/waiter" ) @@ -227,7 +227,8 @@ type SocketOperations struct { family int Endpoint tcpip.Endpoint - skType transport.SockType + skType linux.SockType + protocol int // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` @@ -252,8 +253,8 @@ type SocketOperations struct { } // New creates a new endpoint socket. -func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { - if skType == transport.SockStream { +func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { + if skType == linux.SOCK_STREAM { if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil { return nil, syserr.TranslateNetstackError(err) } @@ -266,6 +267,7 @@ func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Qu family: family, Endpoint: endpoint, skType: skType, + protocol: protocol, }), nil } @@ -550,7 +552,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - ns, err := New(t, s.family, s.skType, wq, ep) + ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) if err != nil { return 0, nil, 0, err } @@ -578,7 +580,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits()) - t.Kernel().RecordSocket(ns, s.family) + t.Kernel().RecordSocket(ns) return fd, addr, addrLen, syserr.FromError(e) } @@ -637,7 +639,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) ( // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -663,7 +665,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_TYPE: @@ -918,6 +920,30 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa t.Kernel().EmitUnimplementedEvent(t) + case linux.TCP_CONGESTION: + if outLen <= 0 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.CongestionControlOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + // We match linux behaviour here where it returns the lower of + // TCP_CA_NAME_MAX bytes or the value of the option length. + // + // This is Linux's net/tcp.h TCP_CA_NAME_MAX. + const tcpCANameMax = 16 + + toCopy := tcpCANameMax + if outLen < tcpCANameMax { + toCopy = outLen + } + b := make([]byte, toCopy) + copy(b, v) + return b, nil + default: emitUnimplementedEventTCP(t, name) } @@ -1220,6 +1246,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + case linux.TCP_CONGESTION: + v := tcpip.CongestionControlOption(optVal) + if err := ep.SetSockOpt(v); err != nil { + return syserr.TranslateNetstackError(err) + } + return nil case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -2281,3 +2313,51 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { } return rv } + +// State implements socket.Socket.State. State translates the internal state +// returned by netstack to values defined by Linux. +func (s *SocketOperations) State() uint32 { + if s.family != linux.AF_INET && s.family != linux.AF_INET6 { + // States not implemented for this socket's family. + return 0 + } + + if !s.isPacketBased() { + // TCP socket. + switch tcp.EndpointState(s.Endpoint.State()) { + case tcp.StateEstablished: + return linux.TCP_ESTABLISHED + case tcp.StateSynSent: + return linux.TCP_SYN_SENT + case tcp.StateSynRecv: + return linux.TCP_SYN_RECV + case tcp.StateFinWait1: + return linux.TCP_FIN_WAIT1 + case tcp.StateFinWait2: + return linux.TCP_FIN_WAIT2 + case tcp.StateTimeWait: + return linux.TCP_TIME_WAIT + case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: + return linux.TCP_CLOSE + case tcp.StateCloseWait: + return linux.TCP_CLOSE_WAIT + case tcp.StateLastAck: + return linux.TCP_LAST_ACK + case tcp.StateListen: + return linux.TCP_LISTEN + case tcp.StateClosing: + return linux.TCP_CLOSING + default: + // Internal or unknown state. + return 0 + } + } + + // TODO(b/112063468): Export states for UDP, ICMP, and raw sockets. + return 0 +} + +// Type implements socket.Socket.Type. +func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.skType, s.protocol +} diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go index ec930d8d5..516582828 100644 --- a/pkg/sentry/socket/epsocket/provider.go +++ b/pkg/sentry/socket/epsocket/provider.go @@ -23,7 +23,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/header" @@ -42,7 +41,7 @@ type provider struct { // getTransportProtocol figures out transport protocol. Currently only TCP, // UDP, and ICMP are supported. -func getTransportProtocol(ctx context.Context, stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { +func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) { switch stype { case linux.SOCK_STREAM: if protocol != 0 && protocol != syscall.IPPROTO_TCP { @@ -80,7 +79,7 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco } // Socket creates a new socket object for the AF_INET or AF_INET6 family. -func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Fail right away if we don't have a stack. stack := t.NetworkContext() if stack == nil { @@ -112,11 +111,11 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int return nil, syserr.TranslateNetstackError(e) } - return New(t, p.family, stype, wq, ep) + return New(t, p.family, stype, protocol, wq, ep) } // Pair just returns nil sockets (not supported). -func (*provider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index a469af7ac..975f47bc3 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -30,7 +30,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/safemem", "//pkg/sentry/socket", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 41f9693bb..c62c8d8f1 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -19,7 +19,9 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/binary" "gvisor.googlesource.com/gvisor/pkg/fdnotifier" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" @@ -28,7 +30,6 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/syserror" @@ -55,15 +56,22 @@ type socketOperations struct { fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout - family int // Read-only. - fd int // must be O_NONBLOCK - queue waiter.Queue + family int // Read-only. + stype linux.SockType // Read-only. + protocol int // Read-only. + fd int // must be O_NONBLOCK + queue waiter.Queue } var _ = socket.Socket(&socketOperations{}) -func newSocketFile(ctx context.Context, family int, fd int, nonblock bool) (*fs.File, *syserr.Error) { - s := &socketOperations{family: family, fd: fd} +func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { + s := &socketOperations{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + } if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } @@ -221,7 +229,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } - f, err := newSocketFile(t, s.family, fd, flags&syscall.SOCK_NONBLOCK != 0) + f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) if err != nil { syscall.Close(fd) return 0, nil, 0, err @@ -232,7 +240,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, } kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits()) - t.Kernel().RecordSocket(f, s.family) + t.Kernel().RecordSocket(f) return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } @@ -519,12 +527,39 @@ func translateIOSyscallError(err error) error { return err } +// State implements socket.Socket.State. +func (s *socketOperations) State() uint32 { + info := linux.TCPInfo{} + buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo) + if err != nil { + if err != syscall.ENOPROTOOPT { + log.Warningf("Failed to get TCP socket info from %+v: %v", s, err) + } + // For non-TCP sockets, silently ignore the failure. + return 0 + } + if len(buf) != linux.SizeOfTCPInfo { + // Unmarshal below will panic if getsockopt returns a buffer of + // unexpected size. + log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo) + return 0 + } + + binary.Unmarshal(buf, usermem.ByteOrder, &info) + return uint32(info.State) +} + +// Type implements socket.Socket.Type. +func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.stype, s.protocol +} + type socketProvider struct { family int } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the host network stack. stack := t.NetworkContext() if stack == nil { @@ -535,7 +570,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p } // Only accept TCP and UDP. - stype := int(stypeflags) & linux.SOCK_TYPE_MASK + stype := stypeflags & linux.SOCK_TYPE_MASK switch stype { case syscall.SOCK_STREAM: switch protocol { @@ -558,15 +593,15 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p // Conservatively ignore all flags specified by the application and add // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. - fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) if err != nil { return nil, syserr.FromError(err) } - return newSocketFile(t, p.family, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) + return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&syscall.SOCK_NONBLOCK != 0) } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 76cf12fd4..5dc103877 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/syserr" ) @@ -66,10 +65,10 @@ type socketProvider struct { } // Socket implements socket.Provider.Socket. -func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Netlink sockets must be specified as datagram or raw, but they // behave the same regardless of type. - if stype != transport.SockDgram && stype != transport.SockRaw { + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { return nil, syserr.ErrSocketNotSupported } @@ -83,7 +82,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol return nil, err } - s, err := NewSocket(t, p) + s, err := NewSocket(t, stype, p) if err != nil { return nil, err } @@ -94,7 +93,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol } // Pair implements socket.Provider.Pair by returning an error. -func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) { +func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { // Netlink sockets never supports creating socket pairs. return nil, nil, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index afd06ca33..62659784a 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -80,6 +80,10 @@ type Socket struct { // protocol is the netlink protocol implementation. protocol Protocol + // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for + // netlink sockets. + skType linux.SockType + // ep is a datagram unix endpoint used to buffer messages sent from the // kernel to userspace. RecvMsg reads messages from this endpoint. ep transport.Endpoint @@ -105,7 +109,7 @@ type Socket struct { var _ socket.Socket = (*Socket)(nil) // NewSocket creates a new Socket. -func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { +func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) { // Datagram endpoint used to buffer kernel -> user messages. ep := transport.NewConnectionless() @@ -126,6 +130,7 @@ func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) { return &Socket{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, + skType: skType, ep: ep, connection: connection, sendBufferSize: defaultSendBufferSize, @@ -616,3 +621,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } + +// State implements socket.Socket.State. +func (s *Socket) State() uint32 { + return s.ep.State() +} + +// Type implements socket.Socket.Type. +func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { + return linux.AF_NETLINK, s.skType, s.protocol.Protocol() +} diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD index 4da14a1e0..33ba20de7 100644 --- a/pkg/sentry/socket/rpcinet/BUILD +++ b/pkg/sentry/socket/rpcinet/BUILD @@ -31,7 +31,6 @@ go_library( "//pkg/sentry/socket/hostinet", "//pkg/sentry/socket/rpcinet/conn", "//pkg/sentry/socket/rpcinet/notifier", - "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", "//pkg/syserr", diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go index 55e0b6665..c22ff1ff0 100644 --- a/pkg/sentry/socket/rpcinet/socket.go +++ b/pkg/sentry/socket/rpcinet/socket.go @@ -32,7 +32,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier" pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" - "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" "gvisor.googlesource.com/gvisor/pkg/sentry/unimpl" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserr" @@ -54,7 +53,10 @@ type socketOperations struct { fsutil.FileUseInodeUnstableAttr `state:"nosave"` socket.SendReceiveTimeout - family int // Read-only. + family int // Read-only. + stype linux.SockType // Read-only. + protocol int // Read-only. + fd uint32 // must be O_NONBLOCK wq *waiter.Queue rpcConn *conn.RPCConnection @@ -70,7 +72,7 @@ type socketOperations struct { var _ = socket.Socket(&socketOperations{}) // New creates a new RPC socket. -func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) { +func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) { id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */) <-c @@ -87,6 +89,8 @@ func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, pr defer dirent.DecRef() return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{ family: family, + stype: skType, + protocol: protocol, wq: &wq, fd: fd, rpcConn: stack.rpcConn, @@ -333,7 +337,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, syserr.FromError(err) } - t.Kernel().RecordSocket(file, s.family) + t.Kernel().RecordSocket(file) if peerRequested { return fd, payload.Address.Address, payload.Address.Length, nil @@ -830,12 +834,23 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } } +// State implements socket.Socket.State. +func (s *socketOperations) State() uint32 { + // TODO(b/127845868): Define a new rpc to query the socket state. + return 0 +} + +// Type implements socket.Socket.Type. +func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { + return s.family, s.stype, s.protocol +} + type socketProvider struct { family int } // Socket implements socket.Provider.Socket. -func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check that we are using the RPC network stack. stack := t.NetworkContext() if stack == nil { @@ -851,7 +866,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p // // Try to restrict the flags we will accept to minimize backwards // incompatibility with netstack. - stype := int(stypeflags) & linux.SOCK_TYPE_MASK + stype := stypeflags & linux.SOCK_TYPE_MASK switch stype { case syscall.SOCK_STREAM: switch protocol { @@ -871,11 +886,11 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p return nil, nil } - return newSocketFile(t, s, p.family, stype, 0) + return newSocketFile(t, s, p.family, stype, protocol) } // Pair implements socket.Provider.Pair. -func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Not supported by AF_INET/AF_INET6. return nil, nil, nil } diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 9393acd28..d60944b6b 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -116,6 +116,13 @@ type Socket interface { // SendTimeout gets the current timeout (in ns) for send operations. Zero // means no timeout, and negative means DONTWAIT. SendTimeout() int64 + + // State returns the current state of the socket, as represented by Linux in + // procfs. The returned state value is protocol-specific. + State() uint32 + + // Type returns the family, socket type and protocol of the socket. + Type() (family int, skType linux.SockType, protocol int) } // Provider is the interface implemented by providers of sockets for specific @@ -126,12 +133,12 @@ type Provider interface { // If a nil Socket _and_ a nil error is returned, it means that the // protocol is not supported. A non-nil error should only be returned // if the protocol is supported, but an error occurs during creation. - Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) + Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) // Pair creates a pair of connected sockets. // // See Socket for error information. - Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) + Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) } // families holds a map of all known address families and their providers. @@ -145,14 +152,14 @@ func RegisterProvider(family int, provider Provider) { } // New creates a new socket with the given family, type and protocol. -func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { for _, p := range families[family] { s, err := p.Socket(t, stype, protocol) if err != nil { return nil, err } if s != nil { - t.Kernel().RecordSocket(s, family) + t.Kernel().RecordSocket(s) return s, nil } } @@ -162,7 +169,7 @@ func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*f // Pair creates a new connected socket pair with the given family, type and // protocol. -func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { providers, ok := families[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported @@ -175,8 +182,8 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (* } if s1 != nil && s2 != nil { k := t.Kernel() - k.RecordSocket(s1, family) - k.RecordSocket(s2, family) + k.RecordSocket(s1) + k.RecordSocket(s2) return s1, s2, nil } } diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 5a2de0c4c..52f324eed 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -28,6 +28,7 @@ go_library( importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport", visibility = ["//:sandbox"], deps = [ + "//pkg/abi/linux", "//pkg/ilist", "//pkg/refs", "//pkg/syserr", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 18e492862..db79ac904 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -17,6 +17,7 @@ package transport import ( "sync" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -44,7 +45,7 @@ type ConnectingEndpoint interface { // Type returns the socket type, typically either SockStream or // SockSeqpacket. The connection attempt must be aborted if this // value doesn't match the ConnectableEndpoint's type. - Type() SockType + Type() linux.SockType // GetLocalAddress returns the bound path. GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) @@ -100,7 +101,7 @@ type connectionedEndpoint struct { // stype is used by connecting sockets to ensure that they are the // same type. The value is typically either tcpip.SockSeqpacket or // tcpip.SockStream. - stype SockType + stype linux.SockType // acceptedChan is per the TCP endpoint implementation. Note that the // sockets in this channel are _already in the connected state_, and @@ -111,7 +112,7 @@ type connectionedEndpoint struct { } // NewConnectioned creates a new unbound connectionedEndpoint. -func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { +func NewConnectioned(stype linux.SockType, uid UniqueIDProvider) Endpoint { return &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), @@ -121,7 +122,7 @@ func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint { } // NewPair allocates a new pair of connected unix-domain connectionedEndpoints. -func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { +func NewPair(stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { a := &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}}, id: uid.UniqueID(), @@ -138,7 +139,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - if stype == SockStream { + if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}} } else { @@ -162,7 +163,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) { // NewExternal creates a new externally backed Endpoint. It behaves like a // socketpair. -func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { +func NewExternal(stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint { return &connectionedEndpoint{ baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected}, id: uid.UniqueID(), @@ -177,7 +178,7 @@ func (e *connectionedEndpoint) ID() uint64 { } // Type implements ConnectingEndpoint.Type and Endpoint.Type. -func (e *connectionedEndpoint) Type() SockType { +func (e *connectionedEndpoint) Type() linux.SockType { return e.stype } @@ -293,7 +294,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - if e.stype == SockStream { + if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { ne.receiver = &queueReceiver{readQueue: writeQueue} @@ -308,7 +309,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur writeQueue: writeQueue, } readQueue.IncRef() - if e.stype == SockStream { + if e.stype == linux.SOCK_STREAM { returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) } else { returnConnect(&queueReceiver{readQueue: readQueue}, connected) @@ -428,7 +429,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) { // Stream sockets do not support specifying the endpoint. Seqpacket // sockets ignore the passed endpoint. - if e.stype == SockStream && to != nil { + if e.stype == linux.SOCK_STREAM && to != nil { return 0, syserr.ErrNotSupported } return e.baseEndpoint.SendMsg(data, c, to) @@ -458,3 +459,11 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask return ready } + +// State implements socket.Socket.State. +func (e *connectionedEndpoint) State() uint32 { + if e.Connected() { + return linux.SS_CONNECTED + } + return linux.SS_UNCONNECTED +} diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 43ff875e4..81ebfba10 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -15,6 +15,7 @@ package transport import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/waiter" @@ -118,8 +119,8 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo } // Type implements Endpoint.Type. -func (e *connectionlessEndpoint) Type() SockType { - return SockDgram +func (e *connectionlessEndpoint) Type() linux.SockType { + return linux.SOCK_DGRAM } // Connect attempts to connect directly to server. @@ -194,3 +195,18 @@ func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMa return ready } + +// State implements socket.Socket.State. +func (e *connectionlessEndpoint) State() uint32 { + e.Lock() + defer e.Unlock() + + switch { + case e.isBound(): + return linux.SS_UNCONNECTED + case e.Connected(): + return linux.SS_CONNECTING + default: + return linux.SS_DISCONNECTING + } +} diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index b734b4c20..5c55c529e 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,6 +19,7 @@ import ( "sync" "sync/atomic" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/syserr" "gvisor.googlesource.com/gvisor/pkg/tcpip" "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer" @@ -28,21 +29,6 @@ import ( // initialLimit is the starting limit for the socket buffers. const initialLimit = 16 * 1024 -// A SockType is a type (as opposed to family) of sockets. These are enumerated -// in the syscall package as syscall.SOCK_* constants. -type SockType int - -const ( - // SockStream corresponds to syscall.SOCK_STREAM. - SockStream SockType = 1 - // SockDgram corresponds to syscall.SOCK_DGRAM. - SockDgram SockType = 2 - // SockRaw corresponds to syscall.SOCK_RAW. - SockRaw SockType = 3 - // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET. - SockSeqpacket SockType = 5 -) - // A RightsControlMessage is a control message containing FDs. type RightsControlMessage interface { // Clone returns a copy of the RightsControlMessage. @@ -175,7 +161,7 @@ type Endpoint interface { // Type return the socket type, typically either SockStream, SockDgram // or SockSeqpacket. - Type() SockType + Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) @@ -191,6 +177,10 @@ type Endpoint interface { // GetSockOpt gets a socket option. opt should be a pointer to one of the // tcpip.*Option types. GetSockOpt(opt interface{}) *tcpip.Error + + // State returns the current state of the socket, as represented by Linux in + // procfs. + State() uint32 } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket @@ -237,6 +227,10 @@ type BoundEndpoint interface { // endpoint. UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error) + // Passcred returns whether or not the SO_PASSCRED socket option is + // enabled on this end. + Passcred() bool + // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. @@ -621,7 +615,7 @@ type connectedEndpoint struct { GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) // Type implements Endpoint.Type. - Type() SockType + Type() linux.SockType } writeQueue *queue @@ -645,7 +639,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, } truncate := false - if e.endpoint.Type() == SockStream { + if e.endpoint.Type() == linux.SOCK_STREAM { // Since stream sockets don't preserve message boundaries, we // can write only as much of the message as fits in the queue. truncate = true diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 1414be0c6..b07e8d67b 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -17,6 +17,7 @@ package unix import ( + "fmt" "strings" "syscall" @@ -55,22 +56,22 @@ type SocketOperations struct { refs.AtomicRefCount socket.SendReceiveTimeout - ep transport.Endpoint - isPacket bool + ep transport.Endpoint + stype linux.SockType } // New creates a new unix socket. -func New(ctx context.Context, endpoint transport.Endpoint, isPacket bool) *fs.File { +func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) defer dirent.DecRef() - return NewWithDirent(ctx, dirent, endpoint, isPacket, fs.FileFlags{Read: true, Write: true}) + return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true}) } // NewWithDirent creates a new unix socket using an existing dirent. -func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, isPacket bool, flags fs.FileFlags) *fs.File { +func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File { return fs.NewFile(ctx, d, flags, &SocketOperations{ - ep: ep, - isPacket: isPacket, + ep: ep, + stype: stype, }) } @@ -88,6 +89,18 @@ func (s *SocketOperations) Release() { s.DecRef() } +func (s *SocketOperations) isPacket() bool { + switch s.stype { + case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + return true + case linux.SOCK_STREAM: + return false + default: + // We shouldn't have allowed any other socket types during creation. + panic(fmt.Sprintf("Invalid socket type %d", s.stype)) + } +} + // Endpoint extracts the transport.Endpoint. func (s *SocketOperations) Endpoint() transport.Endpoint { return s.ep @@ -193,7 +206,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } } - ns := New(t, ep, s.isPacket) + ns := New(t, ep, s.stype) defer ns.DecRef() if flags&linux.SOCK_NONBLOCK != 0 { @@ -221,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, nil, 0, syserr.FromError(e) } - t.Kernel().RecordSocket(ns, linux.AF_UNIX) + t.Kernel().RecordSocket(ns) return fd, addr, addrLen, nil } @@ -385,6 +398,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } defer ep.Release() w.To = ep + + if ep.Passcred() && w.Control.Credentials == nil { + w.Control.Credentials = control.MakeCreds(t) + } } n, err := src.CopyInTo(t, &w) @@ -483,6 +500,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 waitAll := flags&linux.MSG_WAITALL != 0 + isPacket := s.isPacket() // Calculate the number of FDs for which we have space and if we are // requesting credentials. @@ -516,7 +534,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait { var from interface{} var fromLen uint32 - if r.From != nil { + if r.From != nil && len([]byte(r.From.Addr)) != 0 { from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From) } @@ -524,8 +542,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags msgFlags |= linux.MSG_CTRUNC } - if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() { - if s.isPacket && n < int64(r.MsgSize) { + if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() { + if isPacket && n < int64(r.MsgSize) { msgFlags |= linux.MSG_TRUNC } @@ -566,11 +584,11 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags total += n } - if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() { + if err != nil || !waitAll || isPacket || n >= dst.NumBytes() { if total > 0 { err = nil } - if s.isPacket && n < int64(r.MsgSize) { + if isPacket && n < int64(r.MsgSize) { msgFlags |= linux.MSG_TRUNC } return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err) @@ -592,11 +610,22 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } } +// State implements socket.Socket.State. +func (s *SocketOperations) State() uint32 { + return s.ep.State() +} + +// Type implements socket.Socket.Type. +func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { + // Unix domain sockets always have a protocol of 0. + return linux.AF_UNIX, s.stype, 0 +} + // provider is a unix domain socket provider. type provider struct{} // Socket returns a new unix domain socket. -func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) { +func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, syserr.ErrProtocolNotSupported @@ -604,43 +633,36 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) // Create the endpoint and socket. var ep transport.Endpoint - var isPacket bool switch stype { case linux.SOCK_DGRAM: - isPacket = true ep = transport.NewConnectionless() - case linux.SOCK_SEQPACKET: - isPacket = true - fallthrough - case linux.SOCK_STREAM: + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: ep = transport.NewConnectioned(stype, t.Kernel()) default: return nil, syserr.ErrInvalidArgument } - return New(t, ep, isPacket), nil + return New(t, ep, stype), nil } // Pair creates a new pair of AF_UNIX connected sockets. -func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { +func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, nil, syserr.ErrProtocolNotSupported } - var isPacket bool switch stype { - case linux.SOCK_STREAM: - case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: - isPacket = true + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: + // Ok default: return nil, nil, syserr.ErrInvalidArgument } // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(stype, t.Kernel()) - s1 := New(t, ep1, isPacket) - s2 := New(t, ep2, isPacket) + s1 := New(t, ep1, stype) + s2 := New(t, ep2, stype) return s1, s2, nil } diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index dbe53b9a2..0b5ef84c4 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -76,13 +76,13 @@ var SocketFamily = abi.ValueSet{ // SocketType are the possible socket(2) types. var SocketType = abi.ValueSet{ - linux.SOCK_STREAM: "SOCK_STREAM", - linux.SOCK_DGRAM: "SOCK_DGRAM", - linux.SOCK_RAW: "SOCK_RAW", - linux.SOCK_RDM: "SOCK_RDM", - linux.SOCK_SEQPACKET: "SOCK_SEQPACKET", - linux.SOCK_DCCP: "SOCK_DCCP", - linux.SOCK_PACKET: "SOCK_PACKET", + uint64(linux.SOCK_STREAM): "SOCK_STREAM", + uint64(linux.SOCK_DGRAM): "SOCK_DGRAM", + uint64(linux.SOCK_RAW): "SOCK_RAW", + uint64(linux.SOCK_RDM): "SOCK_RDM", + uint64(linux.SOCK_SEQPACKET): "SOCK_SEQPACKET", + uint64(linux.SOCK_DCCP): "SOCK_DCCP", + uint64(linux.SOCK_PACKET): "SOCK_PACKET", } // SocketFlagSet are the possible socket(2) flags. diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index f76989ae2..1c057526b 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -19,6 +19,7 @@ go_library( "sys_identity.go", "sys_inotify.go", "sys_lseek.go", + "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", "sys_pipe.go", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 1ba3695fb..72146ea63 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -92,6 +92,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin // TODO(gvisor.dev/issue/161): In some cases SIGPIPE should // also be sent to the application. return nil + case syserror.ECONNRESET: + // For TCP sendfile connections, we may have a reset. But we + // should just return n as the result. + return nil case syserror.ErrWouldBlock: // Syscall would block, but completed a partial read/write. // This case should only be returned by IssueIO for nonblocking diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 3e4d312af..5251c2463 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -34,33 +34,6 @@ const _AUDIT_ARCH_X86_64 = 0xc000003e // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall // numbers from Linux 4.4. The entries commented out are those syscalls we // don't currently support. -// -// Syscall support is documented as annotations in Go comments of the form: -// @Syscall(<name>, <key:value>, ...) -// -// Supported args and values are: -// -// - arg: A syscall option. This entry only applies to the syscall when given -// this option. -// - support: Indicates support level -// - UNIMPLEMENTED: Unimplemented (default, implies returns:ENOSYS) -// - PARTIAL: Partial support. Details should be provided in note. -// - FULL: Full support -// - returns: Indicates a known return value. Values are syscall errors. This -// is treated as a string so you can use something like -// "returns:EPERM or ENOSYS". -// - issue: A Github issue number. -// - note: A note -// -// Example: -// // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported) -// // @Syscall(mmap, arg:MAP_SHARED, issue:123, note:Shared memory not supported) -// // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support) -// -// Annotations should be placed as close to their implementation as possible -// (preferrably as part of a supporting function's Godoc) and should be -// updated as syscall support changes. Unimplemented syscalls are documented -// here due to their lack of a supporting function or method. var AMD64 = &kernel.SyscallTable{ OS: abi.Linux, Arch: arch.AMD64, @@ -74,405 +47,338 @@ var AMD64 = &kernel.SyscallTable{ Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016", }, AuditNumber: _AUDIT_ARCH_X86_64, - Table: map[uintptr]kernel.SyscallFn{ - 0: Read, - 1: Write, - 2: Open, - 3: Close, - 4: Stat, - 5: Fstat, - 6: Lstat, - 7: Poll, - 8: Lseek, - 9: Mmap, - 10: Mprotect, - 11: Munmap, - 12: Brk, - 13: RtSigaction, - 14: RtSigprocmask, - 15: RtSigreturn, - 16: Ioctl, - 17: Pread64, - 18: Pwrite64, - 19: Readv, - 20: Writev, - 21: Access, - 22: Pipe, - 23: Select, - 24: SchedYield, - 25: Mremap, - 26: Msync, - 27: Mincore, - 28: Madvise, - 29: Shmget, - 30: Shmat, - 31: Shmctl, - 32: Dup, - 33: Dup2, - 34: Pause, - 35: Nanosleep, - 36: Getitimer, - 37: Alarm, - 38: Setitimer, - 39: Getpid, - 40: Sendfile, - 41: Socket, - 42: Connect, - 43: Accept, - 44: SendTo, - 45: RecvFrom, - 46: SendMsg, - 47: RecvMsg, - 48: Shutdown, - 49: Bind, - 50: Listen, - 51: GetSockName, - 52: GetPeerName, - 53: SocketPair, - 54: SetSockOpt, - 55: GetSockOpt, - 56: Clone, - 57: Fork, - 58: Vfork, - 59: Execve, - 60: Exit, - 61: Wait4, - 62: Kill, - 63: Uname, - 64: Semget, - 65: Semop, - 66: Semctl, - 67: Shmdt, - // 68: @Syscall(Msgget), TODO(b/29354921) - // 69: @Syscall(Msgsnd), TODO(b/29354921) - // 70: @Syscall(Msgrcv), TODO(b/29354921) - // 71: @Syscall(Msgctl), TODO(b/29354921) - 72: Fcntl, - 73: Flock, - 74: Fsync, - 75: Fdatasync, - 76: Truncate, - 77: Ftruncate, - 78: Getdents, - 79: Getcwd, - 80: Chdir, - 81: Fchdir, - 82: Rename, - 83: Mkdir, - 84: Rmdir, - 85: Creat, - 86: Link, - 87: Unlink, - 88: Symlink, - 89: Readlink, - 90: Chmod, - 91: Fchmod, - 92: Chown, - 93: Fchown, - 94: Lchown, - 95: Umask, - 96: Gettimeofday, - 97: Getrlimit, - 98: Getrusage, - 99: Sysinfo, - 100: Times, - 101: Ptrace, - 102: Getuid, - 103: Syslog, - 104: Getgid, - 105: Setuid, - 106: Setgid, - 107: Geteuid, - 108: Getegid, - 109: Setpgid, - 110: Getppid, - 111: Getpgrp, - 112: Setsid, - 113: Setreuid, - 114: Setregid, - 115: Getgroups, - 116: Setgroups, - 117: Setresuid, - 118: Getresuid, - 119: Setresgid, - 120: Getresgid, - 121: Getpgid, - // 122: @Syscall(Setfsuid), TODO(b/112851702) - // 123: @Syscall(Setfsgid), TODO(b/112851702) - 124: Getsid, - 125: Capget, - 126: Capset, - 127: RtSigpending, - 128: RtSigtimedwait, - 129: RtSigqueueinfo, - 130: RtSigsuspend, - 131: Sigaltstack, - 132: Utime, - 133: Mknod, - // @Syscall(Uselib, note:Obsolete) - 134: syscalls.Error(syscall.ENOSYS), - // @Syscall(SetPersonality, returns:EINVAL, note:Unable to change personality) - 135: syscalls.ErrorWithEvent(syscall.EINVAL), - // @Syscall(Ustat, note:Needs filesystem support) - 136: syscalls.ErrorWithEvent(syscall.ENOSYS), - 137: Statfs, - 138: Fstatfs, - // 139: @Syscall(Sysfs), TODO(gvisor.dev/issue/165) - 140: Getpriority, - 141: Setpriority, - // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 142: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice - 143: SchedGetparam, - 144: SchedSetscheduler, - 145: SchedGetscheduler, - 146: SchedGetPriorityMax, - 147: SchedGetPriorityMin, - // @Syscall(SchedRrGetInterval, returns:EPERM) - 148: syscalls.ErrorWithEvent(syscall.EPERM), - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, - // @Syscall(Vhangup, returns:EPERM) - 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), - // @Syscall(ModifyLdt, returns:EPERM) - 154: syscalls.Error(syscall.EPERM), - // @Syscall(PivotRoot, returns:EPERM) - 155: syscalls.Error(syscall.EPERM), - // @Syscall(Sysctl, returns:EPERM) - 156: syscalls.Error(syscall.EPERM), // syscall is "worthless" - 157: Prctl, - 158: ArchPrctl, - // @Syscall(Adjtimex, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) - 159: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 160: Setrlimit, - 161: Chroot, - 162: Sync, - // @Syscall(Acct, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_pacct; ENOSYS otherwise) - 163: syscalls.CapError(linux.CAP_SYS_PACCT), // requires cap_sys_pacct - // @Syscall(Settimeofday, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise) - 164: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 165: Mount, - 166: Umount2, - // @Syscall(Swapon, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - // @Syscall(Swapoff, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - // @Syscall(Reboot, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 169: syscalls.CapError(linux.CAP_SYS_BOOT), // requires cap_sys_boot - 170: Sethostname, - 171: Setdomainname, - // @Syscall(Iopl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) - 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio - // @Syscall(Ioperm, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise) - 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio - // @Syscall(CreateModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module - // @Syscall(InitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 175: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module - // @Syscall(DeleteModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 176: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module - // @Syscall(GetKernelSyms, note:Not supported in > 2.6) - 177: syscalls.Error(syscall.ENOSYS), - // @Syscall(QueryModule, note:Not supported in > 2.6) - 178: syscalls.Error(syscall.ENOSYS), - // @Syscall(Quotactl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin (most operations) - // @Syscall(Nfsservctl, note:Does not exist > 3.1) - 180: syscalls.Error(syscall.ENOSYS), - // @Syscall(Getpmsg, note:Not implemented in Linux) - 181: syscalls.Error(syscall.ENOSYS), - // @Syscall(Putpmsg, note:Not implemented in Linux) - 182: syscalls.Error(syscall.ENOSYS), - // @Syscall(AfsSyscall, note:Not implemented in Linux) - 183: syscalls.Error(syscall.ENOSYS), - // @Syscall(Tuxcall, note:Not implemented in Linux) - 184: syscalls.Error(syscall.ENOSYS), - // @Syscall(Security, note:Not implemented in Linux) - 185: syscalls.Error(syscall.ENOSYS), - 186: Gettid, - 187: nil, // @Syscall(Readahead), TODO(b/29351341) - // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support) - 188: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support) - 189: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fsetxattr, returns:ENOTSUP, note:Requires filesystem support) - 190: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Getxattr, returns:ENOTSUP, note:Requires filesystem support) - 191: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lgetxattr, returns:ENOTSUP, note:Requires filesystem support) - 192: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fgetxattr, returns:ENOTSUP, note:Requires filesystem support) - 193: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Listxattr, returns:ENOTSUP, note:Requires filesystem support) - 194: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Llistxattr, returns:ENOTSUP, note:Requires filesystem support) - 195: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Flistxattr, returns:ENOTSUP, note:Requires filesystem support) - 196: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Removexattr, returns:ENOTSUP, note:Requires filesystem support) - 197: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Lremovexattr, returns:ENOTSUP, note:Requires filesystem support) - 198: syscalls.ErrorWithEvent(syscall.ENOTSUP), - // @Syscall(Fremovexattr, returns:ENOTSUP, note:Requires filesystem support) - 199: syscalls.ErrorWithEvent(syscall.ENOTSUP), - 200: Tkill, - 201: Time, - 202: Futex, - 203: SchedSetaffinity, - 204: SchedGetaffinity, - // @Syscall(SetThreadArea, note:Expected to return ENOSYS on 64-bit) - 205: syscalls.Error(syscall.ENOSYS), - 206: IoSetup, - 207: IoDestroy, - 208: IoGetevents, - 209: IoSubmit, - 210: IoCancel, - // @Syscall(GetThreadArea, note:Expected to return ENOSYS on 64-bit) - 211: syscalls.Error(syscall.ENOSYS), - // @Syscall(LookupDcookie, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin - 213: EpollCreate, - // @Syscall(EpollCtlOld, note:Deprecated) - 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) - // @Syscall(EpollWaitOld, note:Deprecated) - 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused) - // @Syscall(RemapFilePages, note:Deprecated) - 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated since 3.16 - 217: Getdents64, - 218: SetTidAddress, - 219: RestartSyscall, - // 220: @Syscall(Semtimedop), TODO(b/29354920) - 221: Fadvise64, - 222: TimerCreate, - 223: TimerSettime, - 224: TimerGettime, - 225: TimerGetoverrun, - 226: TimerDelete, - 227: ClockSettime, - 228: ClockGettime, - 229: ClockGetres, - 230: ClockNanosleep, - 231: ExitGroup, - 232: EpollWait, - 233: EpollCtl, - 234: Tgkill, - 235: Utimes, - // @Syscall(Vserver, note:Not implemented by Linux) - 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux - // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295) - 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice - 238: SetMempolicy, - 239: GetMempolicy, - // 240: @Syscall(MqOpen), TODO(b/29354921) - // 241: @Syscall(MqUnlink), TODO(b/29354921) - // 242: @Syscall(MqTimedsend), TODO(b/29354921) - // 243: @Syscall(MqTimedreceive), TODO(b/29354921) - // 244: @Syscall(MqNotify), TODO(b/29354921) - // 245: @Syscall(MqGetsetattr), TODO(b/29354921) - 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot - 247: Waitid, - // @Syscall(AddKey, returns:EACCES, note:Not available to user) - 248: syscalls.Error(syscall.EACCES), - // @Syscall(RequestKey, returns:EACCES, note:Not available to user) - 249: syscalls.Error(syscall.EACCES), - // @Syscall(Keyctl, returns:EACCES, note:Not available to user) - 250: syscalls.Error(syscall.EACCES), - // @Syscall(IoprioSet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) - // @Syscall(IoprioGet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise) - 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending) - 253: InotifyInit, - 254: InotifyAddWatch, - 255: InotifyRmWatch, - // @Syscall(MigratePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 256: syscalls.CapError(linux.CAP_SYS_NICE), - 257: Openat, - 258: Mkdirat, - 259: Mknodat, - 260: Fchownat, - 261: Futimesat, - 262: Fstatat, - 263: Unlinkat, - 264: Renameat, - 265: Linkat, - 266: Symlinkat, - 267: Readlinkat, - 268: Fchmodat, - 269: Faccessat, - 270: Pselect, - 271: Ppoll, - 272: Unshare, - // @Syscall(SetRobustList, note:Obsolete) - 273: syscalls.Error(syscall.ENOSYS), - // @Syscall(GetRobustList, note:Obsolete) - 274: syscalls.Error(syscall.ENOSYS), - 275: Splice, - // 276: @Syscall(Tee), TODO(b/29354098) - 277: SyncFileRange, - // 278: @Syscall(Vmsplice), TODO(b/29354098) - // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise) - 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly) - 280: Utimensat, - 281: EpollPwait, - // 282: @Syscall(Signalfd), TODO(b/19846426) - 283: TimerfdCreate, - 284: Eventfd, - 285: Fallocate, - 286: TimerfdSettime, - 287: TimerfdGettime, - 288: Accept4, - // 289: @Syscall(Signalfd4), TODO(b/19846426) - 290: Eventfd2, - 291: EpollCreate1, - 292: Dup3, - 293: Pipe2, - 294: InotifyInit1, - 295: Preadv, - 296: Pwritev, - 297: RtTgsigqueueinfo, - // @Syscall(PerfEventOpen, returns:ENODEV, note:No support for perf counters) - 298: syscalls.ErrorWithEvent(syscall.ENODEV), - 299: RecvMMsg, - // @Syscall(FanotifyInit, note:Needs CONFIG_FANOTIFY) - 300: syscalls.ErrorWithEvent(syscall.ENOSYS), - // @Syscall(FanotifyMark, note:Needs CONFIG_FANOTIFY) - 301: syscalls.ErrorWithEvent(syscall.ENOSYS), - 302: Prlimit64, - // @Syscall(NameToHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) - 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), - // @Syscall(OpenByHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support) - 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP), - // @Syscall(ClockAdjtime, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time - 306: Syncfs, - 307: SendMMsg, - // 308: @Syscall(Setns), TODO(b/29354995) - 309: Getcpu, - // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace - // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace - // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace) - 312: syscalls.CapError(linux.CAP_SYS_PTRACE), - // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise) - 313: syscalls.CapError(linux.CAP_SYS_MODULE), - // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler - // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler - // 316: @Syscall(Renameat2), TODO(b/118902772) - 317: Seccomp, - 318: GetRandom, - 319: MemfdCreate, - // @Syscall(KexecFileLoad, EPERM or ENOSYS, note:Infeasible to support. Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 320: syscalls.CapError(linux.CAP_SYS_BOOT), - // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise) - 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands - // 322: @Syscall(Execveat), TODO(b/118901836) - // 323: @Syscall(Userfaultfd), TODO(b/118906345) - // 324: @Syscall(Membarrier), TODO(b/118904897) - 325: Mlock2, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.Supported("open", Open), + 3: syscalls.Supported("close", Close), + 4: syscalls.Undocumented("stat", Stat), + 5: syscalls.Undocumented("fstat", Fstat), + 6: syscalls.Undocumented("lstat", Lstat), + 7: syscalls.Undocumented("poll", Poll), + 8: syscalls.Undocumented("lseek", Lseek), + 9: syscalls.Undocumented("mmap", Mmap), + 10: syscalls.Undocumented("mprotect", Mprotect), + 11: syscalls.Undocumented("munmap", Munmap), + 12: syscalls.Undocumented("brk", Brk), + 13: syscalls.Undocumented("rt_sigaction", RtSigaction), + 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn), + 16: syscalls.Undocumented("ioctl", Ioctl), + 17: syscalls.Undocumented("pread64", Pread64), + 18: syscalls.Undocumented("pwrite64", Pwrite64), + 19: syscalls.Undocumented("readv", Readv), + 20: syscalls.Undocumented("writev", Writev), + 21: syscalls.Undocumented("access", Access), + 22: syscalls.Undocumented("pipe", Pipe), + 23: syscalls.Undocumented("select", Select), + 24: syscalls.Undocumented("sched_yield", SchedYield), + 25: syscalls.Undocumented("mremap", Mremap), + 26: syscalls.Undocumented("msync", Msync), + 27: syscalls.Undocumented("mincore", Mincore), + 28: syscalls.Undocumented("madvise", Madvise), + 29: syscalls.Undocumented("shmget", Shmget), + 30: syscalls.Undocumented("shmat", Shmat), + 31: syscalls.Undocumented("shmctl", Shmctl), + 32: syscalls.Undocumented("dup", Dup), + 33: syscalls.Undocumented("dup2", Dup2), + 34: syscalls.Undocumented("pause", Pause), + 35: syscalls.Undocumented("nanosleep", Nanosleep), + 36: syscalls.Undocumented("getitimer", Getitimer), + 37: syscalls.Undocumented("alarm", Alarm), + 38: syscalls.Undocumented("setitimer", Setitimer), + 39: syscalls.Undocumented("getpid", Getpid), + 40: syscalls.Undocumented("sendfile", Sendfile), + 41: syscalls.Undocumented("socket", Socket), + 42: syscalls.Undocumented("connect", Connect), + 43: syscalls.Undocumented("accept", Accept), + 44: syscalls.Undocumented("sendto", SendTo), + 45: syscalls.Undocumented("recvfrom", RecvFrom), + 46: syscalls.Undocumented("sendmsg", SendMsg), + 47: syscalls.Undocumented("recvmsg", RecvMsg), + 48: syscalls.Undocumented("shutdown", Shutdown), + 49: syscalls.Undocumented("bind", Bind), + 50: syscalls.Undocumented("listen", Listen), + 51: syscalls.Undocumented("getsockname", GetSockName), + 52: syscalls.Undocumented("getpeername", GetPeerName), + 53: syscalls.Undocumented("socketpair", SocketPair), + 54: syscalls.Undocumented("setsockopt", SetSockOpt), + 55: syscalls.Undocumented("getsockopt", GetSockOpt), + 56: syscalls.Undocumented("clone", Clone), + 57: syscalls.Undocumented("fork", Fork), + 58: syscalls.Undocumented("vfork", Vfork), + 59: syscalls.Undocumented("execve", Execve), + 60: syscalls.Undocumented("exit", Exit), + 61: syscalls.Undocumented("wait4", Wait4), + 62: syscalls.Undocumented("kill", Kill), + 63: syscalls.Undocumented("uname", Uname), + 64: syscalls.Undocumented("semget", Semget), + 65: syscalls.Undocumented("semop", Semop), + 66: syscalls.Undocumented("semctl", Semctl), + 67: syscalls.Undocumented("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.Undocumented("fcntl", Fcntl), + 73: syscalls.Undocumented("flock", Flock), + 74: syscalls.Undocumented("fsync", Fsync), + 75: syscalls.Undocumented("fdatasync", Fdatasync), + 76: syscalls.Undocumented("truncate", Truncate), + 77: syscalls.Undocumented("ftruncate", Ftruncate), + 78: syscalls.Undocumented("getdents", Getdents), + 79: syscalls.Undocumented("getcwd", Getcwd), + 80: syscalls.Undocumented("chdir", Chdir), + 81: syscalls.Undocumented("fchdir", Fchdir), + 82: syscalls.Undocumented("rename", Rename), + 83: syscalls.Undocumented("mkdir", Mkdir), + 84: syscalls.Undocumented("rmdir", Rmdir), + 85: syscalls.Undocumented("creat", Creat), + 86: syscalls.Undocumented("link", Link), + 87: syscalls.Undocumented("link", Unlink), + 88: syscalls.Undocumented("symlink", Symlink), + 89: syscalls.Undocumented("readlink", Readlink), + 90: syscalls.Undocumented("chmod", Chmod), + 91: syscalls.Undocumented("fchmod", Fchmod), + 92: syscalls.Undocumented("chown", Chown), + 93: syscalls.Undocumented("fchown", Fchown), + 94: syscalls.Undocumented("lchown", Lchown), + 95: syscalls.Undocumented("umask", Umask), + 96: syscalls.Undocumented("gettimeofday", Gettimeofday), + 97: syscalls.Undocumented("getrlimit", Getrlimit), + 98: syscalls.Undocumented("getrusage", Getrusage), + 99: syscalls.Undocumented("sysinfo", Sysinfo), + 100: syscalls.Undocumented("times", Times), + 101: syscalls.Undocumented("ptrace", Ptrace), + 102: syscalls.Undocumented("getuid", Getuid), + 103: syscalls.Undocumented("syslog", Syslog), + 104: syscalls.Undocumented("getgid", Getgid), + 105: syscalls.Undocumented("setuid", Setuid), + 106: syscalls.Undocumented("setgid", Setgid), + 107: syscalls.Undocumented("geteuid", Geteuid), + 108: syscalls.Undocumented("getegid", Getegid), + 109: syscalls.Undocumented("setpgid", Setpgid), + 110: syscalls.Undocumented("getppid", Getppid), + 111: syscalls.Undocumented("getpgrp", Getpgrp), + 112: syscalls.Undocumented("setsid", Setsid), + 113: syscalls.Undocumented("setreuid", Setreuid), + 114: syscalls.Undocumented("setregid", Setregid), + 115: syscalls.Undocumented("getgroups", Getgroups), + 116: syscalls.Undocumented("setgroups", Setgroups), + 117: syscalls.Undocumented("setresuid", Setresuid), + 118: syscalls.Undocumented("getresuid", Getresuid), + 119: syscalls.Undocumented("setresgid", Setresgid), + 120: syscalls.Undocumented("setresgid", Getresgid), + 121: syscalls.Undocumented("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Undocumented("getsid", Getsid), + 125: syscalls.Undocumented("capget", Capget), + 126: syscalls.Undocumented("capset", Capset), + 127: syscalls.Undocumented("rt_sigpending", RtSigpending), + 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Undocumented("sigaltstack", Sigaltstack), + 132: syscalls.Undocumented("utime", Utime), + 133: syscalls.Undocumented("mknod", Mknod), + 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.Undocumented("statfs", Statfs), + 138: syscalls.Undocumented("fstatfs", Fstatfs), + 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.Undocumented("getpriority", Getpriority), + 141: syscalls.Undocumented("setpriority", Setpriority), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.Undocumented("sched_getparam", SchedGetparam), + 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler), + 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler), + 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax), + 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil), + 149: syscalls.Undocumented("mlock", Mlock), + 150: syscalls.Undocumented("munlock", Munlock), + 151: syscalls.Undocumented("mlockall", Mlockall), + 152: syscalls.Undocumented("munlockall", Munlockall), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil), + 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil), + 157: syscalls.Undocumented("prctl", Prctl), + 158: syscalls.Undocumented("arch_prctl", ArchPrctl), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.Undocumented("setrlimit", Setrlimit), + 161: syscalls.Undocumented("chroot", Chroot), + 162: syscalls.Undocumented("sync", Sync), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.Undocumented("mount", Mount), + 166: syscalls.Undocumented("umount2", Umount2), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Undocumented("sethostname", Sethostname), + 171: syscalls.Undocumented("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil), + 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil), + 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), + 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil), + 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil), + 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil), + 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil), + 186: syscalls.Undocumented("gettid", Gettid), + 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341) + 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil), + 200: syscalls.Undocumented("tkill", Tkill), + 201: syscalls.Undocumented("time", Time), + 202: syscalls.Undocumented("futex", Futex), + 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity), + 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity), + 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.Undocumented("io_setup", IoSetup), + 207: syscalls.Undocumented("io_destroy", IoDestroy), + 208: syscalls.Undocumented("io_getevents", IoGetevents), + 209: syscalls.Undocumented("io_submit", IoSubmit), + 210: syscalls.Undocumented("io_cancel", IoCancel), + 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Undocumented("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil), + 217: syscalls.Undocumented("getdents64", Getdents64), + 218: syscalls.Undocumented("set_tid_address", SetTidAddress), + 219: syscalls.Undocumented("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920) + 221: syscalls.Undocumented("fadvise64", Fadvise64), + 222: syscalls.Undocumented("timer_create", TimerCreate), + 223: syscalls.Undocumented("timer_settime", TimerSettime), + 224: syscalls.Undocumented("timer_gettime", TimerGettime), + 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Undocumented("timer_delete", TimerDelete), + 227: syscalls.Undocumented("clock_settime", ClockSettime), + 228: syscalls.Undocumented("clock_gettime", ClockGettime), + 229: syscalls.Undocumented("clock_getres", ClockGetres), + 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep), + 231: syscalls.Undocumented("exit_group", ExitGroup), + 232: syscalls.Undocumented("epoll_wait", EpollWait), + 233: syscalls.Undocumented("epoll_ctl", EpollCtl), + 234: syscalls.Undocumented("tgkill", Tgkill), + 235: syscalls.Undocumented("utimes", Utimes), + 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.Undocumented("set_mempolicy", SetMempolicy), + 239: syscalls.Undocumented("get_mempolicy", GetMempolicy), + 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Undocumented("waitid", Waitid), + 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil), + 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil), + 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.Undocumented("inotify_init", InotifyInit), + 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch), + 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Undocumented("openat", Openat), + 258: syscalls.Undocumented("mkdirat", Mkdirat), + 259: syscalls.Undocumented("mknodat", Mknodat), + 260: syscalls.Undocumented("fchownat", Fchownat), + 261: syscalls.Undocumented("futimesat", Futimesat), + 262: syscalls.Undocumented("fstatat", Fstatat), + 263: syscalls.Undocumented("unlinkat", Unlinkat), + 264: syscalls.Undocumented("renameat", Renameat), + 265: syscalls.Undocumented("linkat", Linkat), + 266: syscalls.Undocumented("symlinkat", Symlinkat), + 267: syscalls.Undocumented("readlinkat", Readlinkat), + 268: syscalls.Undocumented("fchmodat", Fchmodat), + 269: syscalls.Undocumented("faccessat", Faccessat), + 270: syscalls.Undocumented("pselect", Pselect), + 271: syscalls.Undocumented("ppoll", Ppoll), + 272: syscalls.Undocumented("unshare", Unshare), + 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil), + 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil), + 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 277: syscalls.Undocumented("sync_file_range", SyncFileRange), + 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Undocumented("utimensat", Utimensat), + 281: syscalls.Undocumented("epoll_pwait", EpollPwait), + 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 283: syscalls.Undocumented("timerfd_create", TimerfdCreate), + 284: syscalls.Undocumented("eventfd", Eventfd), + 285: syscalls.Undocumented("fallocate", Fallocate), + 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime), + 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime), + 288: syscalls.Undocumented("accept4", Accept4), + 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426) + 290: syscalls.Undocumented("eventfd2", Eventfd2), + 291: syscalls.Undocumented("epoll_create1", EpollCreate1), + 292: syscalls.Undocumented("dup3", Dup3), + 293: syscalls.Undocumented("pipe2", Pipe2), + 294: syscalls.Undocumented("inotify_init1", InotifyInit1), + 295: syscalls.Undocumented("preadv", Preadv), + 296: syscalls.Undocumented("pwritev", Pwritev), + 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil), + 299: syscalls.Undocumented("recvmmsg", RecvMMsg), + 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Undocumented("prlimit64", Prlimit64), + 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.Undocumented("syncfs", Syncfs), + 307: syscalls.Undocumented("sendmmsg", SendMMsg), + 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Undocumented("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Undocumented("seccomp", Seccomp), + 318: syscalls.Undocumented("getrandom", GetRandom), + 319: syscalls.Undocumented("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836) + 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897) + 325: syscalls.Undocumented("mlock2", Mlock2), + // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: @Syscall(CopyFileRange), - 327: Preadv2, - 328: Pwritev2, + 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil), + 327: syscalls.Undocumented("preadv2", Preadv2), + 328: syscalls.Undocumented("pwritev2", Pwritev2), }, Emulate: map[usermem.Addr]uintptr{ diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go new file mode 100644 index 000000000..652b2c206 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -0,0 +1,312 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// We unconditionally report a single NUMA node. This also means that our +// "nodemask_t" is a single unsigned long (uint64). +const ( + maxNodes = 1 + allowedNodemask = (1 << maxNodes) - 1 +) + +func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) { + // "nodemask points to a bit mask of node IDs that contains up to maxnode + // bits. The bit mask size is rounded to the next multiple of + // sizeof(unsigned long), but the kernel will use bits only up to maxnode. + // A NULL value of nodemask or a maxnode value of zero specifies the empty + // set of nodes. If the value of maxnode is zero, the nodemask argument is + // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate + // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses + // maxnode-1, not maxnode, as the number of bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return 0, syserror.EINVAL + } + if bits == 0 { + return 0, nil + } + // Copy in the whole nodemask. + numUint64 := (bits + 63) / 64 + buf := t.CopyScratchBuffer(int(numUint64) * 8) + if _, err := t.CopyInBytes(addr, buf); err != nil { + return 0, err + } + val := usermem.ByteOrder.Uint64(buf) + // Check that only allowed bits in the first unsigned long in the nodemask + // are set. + if val&^allowedNodemask != 0 { + return 0, syserror.EINVAL + } + // Check that all remaining bits in the nodemask are 0. + for i := 8; i < len(buf); i++ { + if buf[i] != 0 { + return 0, syserror.EINVAL + } + } + return val, nil +} + +func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error { + // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of + // bits. + bits := maxnode - 1 + if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0 + return syserror.EINVAL + } + if bits == 0 { + return nil + } + // Copy out the first unsigned long in the nodemask. + buf := t.CopyScratchBuffer(8) + usermem.ByteOrder.PutUint64(buf, val) + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return err + } + // Zero out remaining unsigned longs in the nodemask. + if bits > 64 { + remAddr, ok := addr.AddLength(8) + if !ok { + return syserror.EFAULT + } + remUint64 := (bits - 1) / 64 + if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return err + } + } + return nil +} + +// GetMempolicy implements the syscall get_mempolicy(2). +func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + mode := args[0].Pointer() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + addr := args[3].Pointer() + flags := args[4].Uint() + + if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { + return 0, nil, syserror.EINVAL + } + nodeFlag := flags&linux.MPOL_F_NODE != 0 + addrFlag := flags&linux.MPOL_F_ADDR != 0 + memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 + + // "EINVAL: The value specified by maxnode is less than the number of node + // IDs supported by the system." - get_mempolicy(2) + if nodemask != 0 && maxnode < maxNodes { + return 0, nil, syserror.EINVAL + } + + // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is + // ignored and the set of nodes (memories) that the thread is allowed to + // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the + // absence of any mode flags) is returned in nodemask." + if memsAllowed { + // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either + // MPOL_F_ADDR or MPOL_F_NODE." + if nodeFlag || addrFlag { + return 0, nil, syserror.EINVAL + } + if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { + return 0, nil, err + } + return 0, nil, nil + } + + // "If flags specifies MPOL_F_ADDR, then information is returned about the + // policy governing the memory address given in addr. ... If the mode + // argument is not NULL, then get_mempolicy() will store the policy mode + // and any optional mode flags of the requested NUMA policy in the location + // pointed to by this argument. If nodemask is not NULL, then the nodemask + // associated with the policy will be stored in the location pointed to by + // this argument." + if addrFlag { + policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr) + if err != nil { + return 0, nil, err + } + if nodeFlag { + // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR, + // get_mempolicy() will return the node ID of the node on which the + // address addr is allocated into the location pointed to by mode. + // If no page has yet been allocated for the specified address, + // get_mempolicy() will allocate a page as if the thread had + // performed a read (load) access to that address, and return the + // ID of the node where that page was allocated." + buf := t.CopyScratchBuffer(1) + _, err := t.CopyInBytes(addr, buf) + if err != nil { + return 0, nil, err + } + policy = 0 // maxNodes == 1 + } + if mode != 0 { + if _, err := t.CopyOut(mode, policy); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil + } + + // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did + // not specify MPOL_F_ADDR and addr is not NULL." This is partially + // inaccurate: if flags specifies MPOL_F_ADDR, + // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will + // just (usually) fail to find a VMA at address 0 and return EFAULT. + if addr != 0 { + return 0, nil, syserror.EINVAL + } + + // "If flags is specified as 0, then information about the calling thread's + // default policy (as set by set_mempolicy(2)) is returned, in the buffers + // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but + // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE, + // then get_mempolicy() will return in the location pointed to by a + // non-NULL mode argument, the node ID of the next node that will be used + // for interleaving of internal kernel pages allocated on behalf of the + // thread." + policy, nodemaskVal := t.NumaPolicy() + if nodeFlag { + if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { + return 0, nil, syserror.EINVAL + } + policy = 0 // maxNodes == 1 + } + if mode != 0 { + if _, err := t.CopyOut(mode, policy); err != nil { + return 0, nil, err + } + } + if nodemask != 0 { + if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// SetMempolicy implements the syscall set_mempolicy(2). +func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + modeWithFlags := args[0].Int() + nodemask := args[1].Pointer() + maxnode := args[2].Uint() + + modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + t.SetNumaPolicy(modeWithFlags, nodemaskVal) + return 0, nil, nil +} + +// Mbind implements the syscall mbind(2). +func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Uint64() + mode := args[2].Int() + nodemask := args[3].Pointer() + maxnode := args[4].Uint() + flags := args[5].Uint() + + if flags&^linux.MPOL_MF_VALID != 0 { + return 0, nil, syserror.EINVAL + } + // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be + // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) + if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { + return 0, nil, syserror.EPERM + } + + mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) + if err != nil { + return 0, nil, err + } + + // Since we claim to have only a single node, all flags can be ignored + // (since all pages must already be on that single node). + err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal) + return 0, nil, err +} + +func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) { + flags := modeWithFlags & linux.MPOL_MODE_FLAGS + mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS + if flags == linux.MPOL_MODE_FLAGS { + // Can't specify both mode flags simultaneously. + return 0, 0, syserror.EINVAL + } + if mode < 0 || mode >= linux.MPOL_MAX { + // Must specify a valid mode. + return 0, 0, syserror.EINVAL + } + + var nodemaskVal uint64 + if nodemask != 0 { + var err error + nodemaskVal, err = copyInNodemask(t, nodemask, maxnode) + if err != nil { + return 0, 0, err + } + } + + switch mode { + case linux.MPOL_DEFAULT: + // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; + // Linux allows a nodemask to be specified, as long as it is empty. + if nodemaskVal != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: + // These require a non-empty nodemask. + if nodemaskVal == 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_PREFERRED: + // This permits an empty nodemask, as long as no flags are set. + if nodemaskVal == 0 && flags != 0 { + return 0, 0, syserror.EINVAL + } + case linux.MPOL_LOCAL: + // This requires an empty nodemask and no flags set ... + if nodemaskVal != 0 || flags != 0 { + return 0, 0, syserror.EINVAL + } + // ... and is implemented as MPOL_PREFERRED. + mode = linux.MPOL_PREFERRED + default: + // Unknown mode, which we should have rejected above. + panic(fmt.Sprintf("unknown mode: %v", mode)) + } + + return mode | flags, nodemaskVal, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 64a6e639c..9926f0ac5 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -204,151 +204,6 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } } -func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) { - if ptr != 0 { - return t.CopyOut(ptr, val) - } - return 0, nil -} - -// GetMempolicy implements the syscall get_mempolicy(2). -func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - mode := args[0].Pointer() - nodemask := args[1].Pointer() - maxnode := args[2].Uint() - addr := args[3].Pointer() - flags := args[4].Uint() - - memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 - nodeFlag := flags&linux.MPOL_F_NODE != 0 - addrFlag := flags&linux.MPOL_F_ADDR != 0 - - // TODO(rahat): Once sysfs is implemented, report a single numa node in - // /sys/devices/system/node. - if nodemask != 0 && maxnode < 1 { - return 0, nil, syserror.EINVAL - } - - // 'addr' provided iff 'addrFlag' set. - if addrFlag == (addr == 0) { - return 0, nil, syserror.EINVAL - } - - // Default policy for the thread. - if flags == 0 { - policy, nodemaskVal := t.NumaPolicy() - if _, err := copyOutIfNotNull(t, mode, policy); err != nil { - return 0, nil, syserror.EFAULT - } - if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - // Report all nodes available to caller. - if memsAllowed { - // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED. - if nodeFlag || addrFlag { - return 0, nil, syserror.EINVAL - } - - // Report a single numa node. - if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - if addrFlag { - if nodeFlag { - // Return the id for the node where 'addr' resides, via 'mode'. - // - // The real get_mempolicy(2) allocates the page referenced by 'addr' - // by simulating a read, if it is unallocated before the call. It - // then returns the node the page is allocated on through the mode - // pointer. - b := t.CopyScratchBuffer(1) - _, err := t.CopyInBytes(addr, b) - if err != nil { - return 0, nil, syserror.EFAULT - } - if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { - return 0, nil, syserror.EFAULT - } - } else { - storedPolicy, _ := t.NumaPolicy() - // Return the policy governing the memory referenced by 'addr'. - if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil { - return 0, nil, syserror.EFAULT - } - } - return 0, nil, nil - } - - storedPolicy, _ := t.NumaPolicy() - if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) { - // Policy for current thread is to interleave memory between - // nodes. Return the next node we'll allocate on. Since we only have a - // single node, this is always node 0. - if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil { - return 0, nil, syserror.EFAULT - } - return 0, nil, nil - } - - return 0, nil, syserror.EINVAL -} - -func allowedNodesMask() uint32 { - const maxNodes = 1 - return ^uint32((1 << maxNodes) - 1) -} - -// SetMempolicy implements the syscall set_mempolicy(2). -func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - modeWithFlags := args[0].Int() - nodemask := args[1].Pointer() - maxnode := args[2].Uint() - - if nodemask != 0 && maxnode < 1 { - return 0, nil, syserror.EINVAL - } - - if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS { - // Can't specify multiple modes simultaneously. - return 0, nil, syserror.EINVAL - } - - mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS - if mode < 0 || mode >= linux.MPOL_MAX { - // Must specify a valid mode. - return 0, nil, syserror.EINVAL - } - - var nodemaskVal uint32 - // Nodemask may be empty for some policy modes. - if nodemask != 0 && maxnode > 0 { - if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil { - return 0, nil, syserror.EFAULT - } - } - - if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 { - // Mode requires a non-empty nodemask, but got an empty nodemask. - return 0, nil, syserror.EINVAL - } - - if nodemaskVal&allowedNodesMask() != 0 { - // Invalid node specified. - return 0, nil, syserror.EINVAL - } - - t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal) - - return 0, nil, nil -} - // Mincore implements the syscall mincore(2). func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 117ae1a0e..1b7e5616b 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -15,6 +15,7 @@ package linux import ( + "fmt" "syscall" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -23,6 +24,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" ) // Prctl implements linux syscall prctl(2). @@ -44,6 +46,33 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) return 0, nil, err + case linux.PR_GET_DUMPABLE: + d := t.MemoryManager().Dumpability() + switch d { + case mm.NotDumpable: + return linux.SUID_DUMP_DISABLE, nil, nil + case mm.UserDumpable: + return linux.SUID_DUMP_USER, nil, nil + case mm.RootDumpable: + return linux.SUID_DUMP_ROOT, nil, nil + default: + panic(fmt.Sprintf("Unknown dumpability %v", d)) + } + + case linux.PR_SET_DUMPABLE: + var d mm.Dumpability + switch args[1].Int() { + case linux.SUID_DUMP_DISABLE: + d = mm.NotDumpable + case linux.SUID_DUMP_USER: + d = mm.UserDumpable + default: + // N.B. Userspace may not pass SUID_DUMP_ROOT. + return 0, nil, syscall.EINVAL + } + t.MemoryManager().SetDumpability(d) + return 0, nil, nil + case linux.PR_GET_KEEPCAPS: if t.Credentials().KeepCaps { return 1, nil, nil @@ -171,9 +200,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return 0, nil, t.DropBoundingCapability(cp) - case linux.PR_GET_DUMPABLE, - linux.PR_SET_DUMPABLE, - linux.PR_GET_TIMING, + case linux.PR_GET_TIMING, linux.PR_SET_TIMING, linux.PR_GET_TSC, linux.PR_SET_TSC, diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 8f4dbf3bc..31295a6a9 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -188,7 +188,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Create the new socket. - s, e := socket.New(t, domain, transport.SockType(stype&0xf), protocol) + s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } @@ -227,7 +227,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Create the socket pair. - s1, s2, e := socket.Pair(t, domain, transport.SockType(stype&0xf), protocol) + s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go index 5d10b3824..48c114232 100644 --- a/pkg/sentry/syscalls/syscalls.go +++ b/pkg/sentry/syscalls/syscalls.go @@ -25,37 +25,97 @@ package syscalls import ( + "fmt" + "syscall" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// Supported returns a syscall that is fully supported. +func Supported(name string, fn kernel.SyscallFn) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportFull, + Note: "Full Support", + } +} + +// Undocumented returns a syscall that is undocumented. +func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportUndocumented, + } +} + +// PartiallySupported returns a syscall that has a partial implementation. +func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall { + return kernel.Syscall{ + Name: name, + Fn: fn, + SupportLevel: kernel.SupportPartial, + Note: note, + URLs: urls, + } +} + // Error returns a syscall handler that will always give the passed error. -func Error(err error) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, err +func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + URLs: urls, } } // ErrorWithEvent gives a syscall function that sends an unimplemented // syscall event via the event channel and returns the passed error. -func ErrorWithEvent(err error) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, err +func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, err + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q", note, err.Error()), + URLs: urls, } } // CapError gives a syscall function that checks for capability c. If the task // has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged // tasks, it will seem like there is an implementation. -func CapError(c linux.Capability) kernel.SyscallFn { - return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - if !t.HasCapability(c) { - return 0, nil, syserror.EPERM - } - t.Kernel().EmitUnimplementedEvent(t) - return 0, nil, syserror.ENOSYS +func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall { + if note != "" { + note = note + "; " + } + return kernel.Syscall{ + Name: name, + Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if !t.HasCapability(c) { + return 0, nil, syserror.EPERM + } + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + }, + SupportLevel: kernel.SupportUnimplemented, + Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS), + URLs: urls, } } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index b2f8f6832..b50579a92 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -32,7 +32,7 @@ go_library( "tsc_arm64.s", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/time", - visibility = ["//pkg/sentry:internal"], + visibility = ["//:sandbox"], deps = [ "//pkg/log", "//pkg/metric", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 09198496b..860733061 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -17,6 +17,6 @@ go_library( ], deps = [ "//pkg/bits", - "//pkg/sentry/memutil", + "//pkg/memutil", ], ) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index c316f1597..9ed974ccb 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -22,7 +22,7 @@ import ( "syscall" "gvisor.googlesource.com/gvisor/pkg/bits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memutil" + "gvisor.googlesource.com/gvisor/pkg/memutil" ) // MemoryKind represents a type of memory used by the application. diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go index 31e4d6ada..9dde327a2 100644 --- a/pkg/sentry/usermem/usermem.go +++ b/pkg/sentry/usermem/usermem.go @@ -222,9 +222,11 @@ func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts return int(r.Addr - addr), nil } -// copyStringIncrement is the maximum number of bytes that are copied from -// virtual memory at a time by CopyStringIn. -const copyStringIncrement = 64 +// CopyStringIn tuning parameters, defined outside that function for tests. +const ( + copyStringIncrement = 64 + copyStringMaxInitBufLen = 256 +) // CopyStringIn copies a NUL-terminated string of unknown length from the // memory mapped at addr in uio and returns it as a string (not including the @@ -234,31 +236,38 @@ const copyStringIncrement = 64 // // Preconditions: As for IO.CopyFromUser. maxlen >= 0. func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { - buf := make([]byte, maxlen) + initLen := maxlen + if initLen > copyStringMaxInitBufLen { + initLen = copyStringMaxInitBufLen + } + buf := make([]byte, initLen) var done int for done < maxlen { - start, ok := addr.AddLength(uint64(done)) - if !ok { - // Last page of kernel memory. The application can't use this - // anyway. - return stringFromImmutableBytes(buf[:done]), syserror.EFAULT - } // Read up to copyStringIncrement bytes at a time. readlen := copyStringIncrement if readlen > maxlen-done { readlen = maxlen - done } - end, ok := start.AddLength(uint64(readlen)) + end, ok := addr.AddLength(uint64(readlen)) if !ok { return stringFromImmutableBytes(buf[:done]), syserror.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial // copies up to the end of application-mappable memory succeed. - if start.RoundDown() != end.RoundDown() { + if addr.RoundDown() != end.RoundDown() { end = end.RoundDown() + readlen = int(end - addr) + } + // Ensure that our buffer is large enough to accommodate the read. + if done+readlen > len(buf) { + newBufLen := len(buf) * 2 + if newBufLen > maxlen { + newBufLen = maxlen + } + buf = append(buf, make([]byte, newBufLen-len(buf))...) } - n, err := uio.CopyIn(ctx, start, buf[done:done+int(end-start)], opts) + n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) // Look for the terminating zero byte, which may have occurred before // hitting err. for i, c := range buf[done : done+n] { @@ -270,6 +279,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt if err != nil { return stringFromImmutableBytes(buf[:done]), err } + addr = end } return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG } diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go index 4a07118b7..575e5039d 100644 --- a/pkg/sentry/usermem/usermem_test.go +++ b/pkg/sentry/usermem/usermem_test.go @@ -192,6 +192,7 @@ func TestCopyObject(t *testing.T) { } func TestCopyStringInShort(t *testing.T) { + // Tests for string length <= copyStringIncrement. want := strings.Repeat("A", copyStringIncrement-2) mem := want + "\x00" if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { @@ -200,13 +201,25 @@ func TestCopyStringInShort(t *testing.T) { } func TestCopyStringInLong(t *testing.T) { - want := strings.Repeat("A", copyStringIncrement+1) + // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen + // (requiring multiple calls to IO.CopyIn()). + want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4) mem := want + "\x00" if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) } } +func TestCopyStringInVeryLong(t *testing.T) { + // Tests for string length > copyStringMaxInitBufLen (requiring buffer + // reallocation). + want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { want := strings.Repeat("A", copyStringIncrement-1) got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) |