diff options
Diffstat (limited to 'pkg/sentry')
78 files changed, 3451 insertions, 1442 deletions
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index d16d78aa5..e74275d2d 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -20,9 +20,11 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -36,6 +38,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b51fb3959..2ed17ee09 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -24,13 +24,16 @@ import ( "text/tabwriter" "time" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -84,15 +87,13 @@ type ExecArgs struct { // the root group if not set explicitly. KGID auth.KGID - // ExtraKGIDs is the list of additional groups to which the user - // belongs. + // ExtraKGIDs is the list of additional groups to which the user belongs. ExtraKGIDs []auth.KGID // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities - // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host - // pty FD. + // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. @@ -117,7 +118,7 @@ func (args ExecArgs) String() string { // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, _, _, err := proc.execAsync(args) + newTG, _, _, _, err := proc.execAsync(args) if err != nil { return err } @@ -130,26 +131,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. -func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the // newly created thread group and its PID. If the stdio FDs are TTYs, then a // TTYFileOperations that wraps the TTY is also returned. -func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() defer fdTable.DecRef() - // No matter what happens, we should close all files in the FilePayload - // before returning. Any files that are imported will be duped. - defer func() { - for _, f := range args.FilePayload.Files { - f.Close() - } - }() - creds := auth.NewUserCredentials( args.KUID, args.KGID, @@ -202,7 +195,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI vfsObj := proc.Kernel.VFS() file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.File = fsbridge.NewVFSFile(file) } else { @@ -214,74 +207,57 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. - initArgs.MountNamespaceVFS2.IncRef() + initArgs.MountNamespace.IncRef() } f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } } - // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. - var ttyFile *fs.File - for appFD, hostFile := range args.FilePayload.Files { - var appFile *fs.File - - if args.StdioIsPty && appFD < 3 { - // Import the file as a host TTY file. - if ttyFile == nil { - var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), true /* isTTY */) - if err != nil { - return nil, 0, nil, err - } - defer appFile.DecRef() - - // Remember this in the TTY file, as we will - // use it for the other stdio FDs. - ttyFile = appFile - } else { - // Re-use the existing TTY file, as all three - // stdio FDs must point to the same fs.File in - // order to share TTY state, specifically the - // foreground process group id. - appFile = ttyFile - } - } else { - // Import the file as a regular host file. - var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), false /* isTTY */) + fds := make([]int, len(args.FilePayload.Files)) + for i, file := range args.FilePayload.Files { + if kernel.VFS2Enabled { + // Need to dup to remove ownership from os.File. + dup, err := unix.Dup(int(file.Fd())) if err != nil { - return nil, 0, nil, err + return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) } - defer appFile.DecRef() + fds[i] = dup + } else { + // VFS1 dups the file on import. + fds[i] = int(file.Fd()) } - - // Add the file to the FD map. - if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - return nil, 0, nil, err + } + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) + if err != nil { + if kernel.VFS2Enabled { + for _, fd := range fds { + unix.Close(fd) + } } + return nil, 0, nil, nil, err } tg, tid, err := proc.Kernel.CreateProcess(initArgs) if err != nil { - return nil, 0, nil, err + return nil, 0, nil, nil, err } - var ttyFileOps *host.TTYFileOperations - if ttyFile != nil { - // Set the foreground process group on the TTY before starting - // the process. - ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations) - ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup()) + // Set the foreground process group on the TTY before starting the process. + switch { + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) } // Start the newly created process. proc.Kernel.StartProcess(tg) - return tg, tid, ttyFileOps, nil + return tg, tid, ttyFile, ttyFileVFS2, nil } // PsArgs is the set of arguments to ps. diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD new file mode 100644 index 000000000..5e41ceb4e --- /dev/null +++ b/pkg/sentry/fdimport/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "fdimport", + srcs = [ + "fdimport.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/host", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go new file mode 100644 index 000000000..a4199f9e9 --- /dev/null +++ b/pkg/sentry/fdimport/fdimport.go @@ -0,0 +1,129 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdimport + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Import imports a slice of FDs into the given FDTable. If console is true, +// sets up TTY for the first 3 FDs in the slice representing stdin, stdout, +// stderr. Upon success, Import takes ownership of all FDs. +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if kernel.VFS2Enabled { + ttyFile, err := importVFS2(ctx, fdTable, console, fds) + return nil, ttyFile, err + } + ttyFile, err := importFS(ctx, fdTable, console, fds) + return ttyFile, nil, err +} + +func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) { + var ttyFile *fs.File + for appFD, hostFD := range fds { + var appFile *fs.File + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. + var err error + appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + } + + // Add the file to the FD map. + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + return nil, err + } + } + + if ttyFile == nil { + return nil, nil + } + return ttyFile.FileOperations.(*host.TTYFileOperations), nil +} + +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { + k := kernel.KernelFromContext(ctx) + + var ttyFile *vfs.FileDescription + for appFD, hostFD := range stdioFDs { + var appFile *vfs.FileDescription + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will use it for the other stdio + // FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three stdio FDs must point to + // the same fs.File in order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + var err error + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + } + + if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + return nil, err + } + } + + if ttyFile == nil { + return nil, nil + } + return ttyFile.Impl().(*hostvfs2.TTYFileDescription), nil +} diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 6db4b762d..d481baf77 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -75,10 +75,18 @@ func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner // task's EUID/EGID. owner := mounter if valid.UID { - owner.UID = auth.KUID(pattr.UID) + if pattr.UID.Ok() { + owner.UID = auth.KUID(pattr.UID) + } else { + owner.UID = auth.KUID(auth.OverflowUID) + } } if valid.GID { - owner.GID = auth.KGID(pattr.GID) + if pattr.GID.Ok() { + owner.GID = auth.KGID(pattr.GID) + } else { + owner.GID = auth.KGID(auth.OverflowGID) + } } return owner } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 10ba2f5f0..40f2c1cad 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -47,6 +47,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return &endpoint{inode, i.fileState.file.file, path} } +// LINT.IfChange + // endpoint is a Gofer-backed transport.BoundEndpoint. // // An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() @@ -146,3 +148,5 @@ func (e *endpoint) Release() { func (e *endpoint) Passcred() bool { return false } + +// LINT.ThenChange(../../fsimpl/gofer/socket.go) diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 465b47da9..91617267d 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -58,12 +58,16 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) var buf bytes.Buffer fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. - fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree/1024) + fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(&buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(&buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 181d765d3..94db8fe5c 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -59,7 +59,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // master inode. It returns the filesystem and root Dentry. func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { fs := &kernfs.Filesystem{} - fs.Init(vfsObj, fstype) + fs.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 2c22a04af..77b644275 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -485,11 +485,14 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { - _, _, err := fs.walk(rp, false) +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + _, inode, err := fs.walk(rp, false) if err != nil { return nil, err } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } // TODO(b/134676337): Support sockets. return nil, syserror.ECONNREFUSED diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index b9c4beee4..5ce82b793 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -38,6 +38,7 @@ go_library( "p9file.go", "pagemath.go", "regular_file.go", + "socket.go", "special_file.go", "symlink.go", "time.go", @@ -52,18 +53,25 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/host", "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/syserr", "//pkg/syserror", "//pkg/unet", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 55f9ed911..b98218753 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -15,6 +15,7 @@ package gofer import ( + "fmt" "sync" "sync/atomic" @@ -22,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -59,28 +62,52 @@ func (d *dentry) cacheNegativeLookupLocked(name string) { d.children[name] = nil } -// createSyntheticDirectory creates a synthetic directory with the given name +type createSyntheticOpts struct { + name string + mode linux.FileMode + kuid auth.KUID + kgid auth.KGID + + // The endpoint for a synthetic socket. endpoint should be nil if the file + // being created is not a socket. + endpoint transport.BoundEndpoint + + // pipe should be nil if the file being created is not a pipe. + pipe *pipe.VFSPipe +} + +// createSyntheticChildLocked creates a synthetic file with the given name // in d. // // Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain // a child with the given name. -func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) { +func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { d2 := &dentry{ refs: 1, // held by d fs: d.fs, - mode: uint32(mode) | linux.S_IFDIR, - uid: uint32(kuid), - gid: uint32(kgid), + mode: uint32(opts.mode), + uid: uint32(opts.kuid), + gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary handle: handle{ fd: -1, }, nlink: uint32(2), } + switch opts.mode.FileType() { + case linux.S_IFDIR: + // Nothing else needs to be done. + case linux.S_IFSOCK: + d2.endpoint = opts.endpoint + case linux.S_IFIFO: + d2.pipe = opts.pipe + default: + panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) + } d2.pf.dentry = d2 d2.vfsd.Init(d2) - d.cacheNewChildLocked(d2, name) + d.cacheNewChildLocked(d2, opts.name) d.syntheticChildren++ } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 98ccb42fd..4a32821bd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -22,9 +22,11 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -652,7 +654,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) - parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) } if fs.opts.interop != InteropModeShared { parent.incLinks() @@ -663,7 +670,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // Can't create non-synthetic files in synthetic directories. return syserror.EPERM } - parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) parent.incLinks() return nil }) @@ -674,6 +686,30 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // If the gofer does not allow creating a socket or pipe, create a + // synthetic one, i.e. one that is kept entirely in memory. + if err == syserror.EPERM { + switch opts.Mode.FileType() { + case linux.S_IFSOCK: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + endpoint: opts.Endpoint, + }) + return nil + case linux.S_IFIFO: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + }) + return nil + } + } return err }, nil) } @@ -756,20 +792,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } mnt := rp.Mount() - filetype := d.fileType() - switch { - case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { - return nil, err - } - fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { - return nil, err + switch d.fileType() { + case linux.S_IFREG: + if !d.fs.opts.regularFilesUseSpecialFileFD { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil } - return &fd.vfsfd, nil - case filetype == linux.S_IFDIR: + case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, syserror.EISDIR @@ -791,26 +828,40 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } return &fd.vfsfd, nil - case filetype == linux.S_IFLNK: + case linux.S_IFLNK: // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP - default: - if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL - } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) - if err != nil { - return nil, err - } - fd := &specialFileFD{ - handle: h, + case linux.S_IFSOCK: + if d.isSynthetic() { + return nil, syserror.ENXIO } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - h.close(ctx) - return nil, err + case linux.S_IFIFO: + if d.isSynthetic() { + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) } - return &fd.vfsfd, nil } + return d.openSpecialFileLocked(ctx, mnt, opts) +} + +func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + // Treat as a special file. This is done for non-synthetic pipes as well as + // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true. + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) + if err != nil { + return nil, err + } + fd := &specialFileFD{ + handle: h, + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. @@ -1196,15 +1247,28 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } - // TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if d.isSocket() { + if !d.isSynthetic() { + d.IncRef() + return &endpoint{ + dentry: d, + file: d.file.file, + path: opts.Addr, + }, nil + } + return d.endpoint, nil + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8b4e91d17..9ab8fdc65 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -46,9 +46,11 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" @@ -70,7 +72,8 @@ type filesystem struct { mfp pgalloc.MemoryFileProvider // Immutable options. - opts filesystemOptions + opts filesystemOptions + iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. client *p9.Client @@ -207,6 +210,16 @@ const ( InteropModeShared ) +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +type InternalFilesystemOptions struct { + // If LeakConnection is true, do not close the connection to the server + // when the Filesystem is released. This is necessary for deployments in + // which servers can handle only a single client and report failure if that + // client disconnects. + LeakConnection bool +} + // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name @@ -345,6 +358,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } + // Handle internal options. + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if opts.InternalData != nil && !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) + return nil, nil, syserror.EINVAL + } + // If !ok, iopts being the zero value is correct. + // Establish a connection with the server. conn, err := unet.NewSocket(fsopts.fd) if err != nil { @@ -381,6 +402,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := &filesystem{ mfp: mfp, opts: fsopts, + iopts: iopts, uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, client: client, @@ -438,8 +460,10 @@ func (fs *filesystem) Release() { // fs. fs.syncMu.Unlock() - // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() + if !fs.iopts.LeakConnection { + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() + } } // dentry implements vfs.DentryImpl. @@ -472,10 +496,8 @@ type dentry struct { // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file - // that does not exist on the remote filesystem. As of this writing, this - // is only possible for a directory created with - // MkdirOptions.ForSyntheticMountpoint == true. - // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes). + // that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. file p9file // If deleted is non-zero, the file represented by this dentry has been @@ -585,6 +607,14 @@ type dentry struct { // and target are protected by dataMu. haveTarget bool target string + + // If this dentry represents a synthetic socket file, endpoint is the + // transport endpoint bound to this file. + endpoint transport.BoundEndpoint + + // If this dentry represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -631,11 +661,11 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma }, } d.pf.dentry = d - if mask.UID && attr.UID != auth.NoID { - d.uid = uint32(attr.UID) + if mask.UID { + d.uid = dentryUIDFromP9UID(attr.UID) } - if mask.GID && attr.GID != auth.NoID { - d.gid = uint32(attr.GID) + if mask.GID { + d.gid = dentryGIDFromP9GID(attr.GID) } if mask.Size { d.size = attr.Size @@ -686,10 +716,10 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.mode, uint32(attr.Mode)) } if mask.UID { - atomic.StoreUint32(&d.uid, uint32(attr.UID)) + atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID)) } if mask.GID { - atomic.StoreUint32(&d.gid, uint32(attr.GID)) + atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID)) } // There is no P9_GETATTR_* bit for I/O block size. if attr.BlockSize != 0 { @@ -791,10 +821,21 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW + + // Prepare for truncate. + if stat.Mask&linux.STATX_SIZE != 0 { + switch d.mode & linux.S_IFMT { + case linux.S_IFREG: + if !setLocalMtime { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } } } d.metadataMu.Lock() @@ -896,6 +937,20 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func dentryUIDFromP9UID(uid p9.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromP9GID(gid p9.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) +} + // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against @@ -1049,6 +1104,7 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() + if !d.file.isNil() { d.file.close(ctx) d.file = p9file{} @@ -1134,7 +1190,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name return d.file.removeXattr(ctx, name) } -// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go new file mode 100644 index 000000000..d6dbe9092 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -0,0 +1,146 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isSocket() bool { + return d.fileType() == linux.S_IFSOCK +} + +// endpoint is a Gofer-backed transport.BoundEndpoint. +// +// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // dentry is the filesystem dentry which produced this endpoint. + dentry *dentry + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. + path string +} + +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { + switch t { + case linux.SOCK_STREAM: + return p9.StreamSocket, true + case linux.SOCK_SEQPACKET: + return p9.SeqpacketSocket, true + case linux.SOCK_DGRAM: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { + cf, ok := sockTypeToP9(ce.Type()) + if !ok { + return syserr.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + + // Check connecting state. + if ce.Connected() { + ce.Unlock() + return syserr.ErrAlreadyConnected + } + if ce.Listening() { + ce.Unlock() + return syserr.ErrInvalidEndpointState + } + + c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + if err != nil { + ce.Unlock() + return err + } + + returnConnect(c, c) + ce.Unlock() + if err := c.Init(); err != nil { + return syserr.FromError(err) + } + + return nil +} + +// UnidirectionalConnect implements +// transport.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { + c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + if err != nil { + return nil, err + } + + if err := c.Init(); err != nil { + return nil, syserr.FromError(err) + } + + // We don't need the receiver. + c.CloseRecv() + c.Release() + + return c, nil +} + +func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + hostFile, err := e.file.Connect(flags) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + // Dup the fd so that the new endpoint can manage its lifetime. + hostFD, err := syscall.Dup(hostFile.FD()) + if err != nil { + log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) + return nil, syserr.FromError(err) + } + // After duplicating, we no longer need hostFile. + hostFile.Close() + + c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + if serr != nil { + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr) + return nil, serr + } + return c, nil +} + +// Release implements transport.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.dentry.DecRef() +} + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index e1c56d89b..39509f703 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fdnotifier", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/sentry/arch", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 05538ea42..144e04905 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -24,6 +24,8 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -35,39 +37,12 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) -// filesystemType implements vfs.FilesystemType. -type filesystemType struct{} - -// GetFilesystem implements FilesystemType.GetFilesystem. -func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - panic("host.filesystemType.GetFilesystem should never be called") -} - -// Name implements FilesystemType.Name. -func (filesystemType) Name() string { - return "none" -} - -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem -} - -// NewFilesystem sets up and returns a new hostfs filesystem. -// -// Note that there should only ever be one instance of host.filesystem, -// a global mount for host fds. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) - return fs.VFSFilesystem() -} - // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { - fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) + fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } @@ -88,11 +63,12 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs seekable := err != syserror.ESPIPE i := &inode{ - hostFD: hostFD, - seekable: seekable, - isTTY: isTTY, - canMap: canMap(uint32(fileType)), - ino: fs.NextIno(), + hostFD: hostFD, + seekable: seekable, + isTTY: isTTY, + canMap: canMap(uint32(fileType)), + wouldBlock: wouldBlock(uint32(fileType)), + ino: fs.NextIno(), // For simplicity, set offset to 0. Technically, we should use the existing // offset on the host if the file is seekable. offset: 0, @@ -103,14 +79,60 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } + // If the hostFD would block, we must set it to non-blocking and handle + // blocking behavior in the sentry. + if i.wouldBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + return nil, err + } + } + d := &kernfs.Dentry{} d.Init(i) + // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt) } +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("host.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +func (filesystemType) Name() string { + return "none" +} + +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.VFSFilesystem() +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + d := vd.Dentry().Impl().(*kernfs.Dentry) + inode := d.Inode().(*inode) + b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} +} + // inode implements kernfs.Inode. type inode struct { kernfs.InodeNotDirectory @@ -125,6 +147,12 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. + // + // This field is initialized at creation time and is immutable. + wouldBlock bool + // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return // EWOULDBLOCK for I/O operations. @@ -152,6 +180,9 @@ type inode struct { // offset specifies the current file offset. offset int64 + + // Event queue for blocking operations. + queue waiter.Queue } // Note that these flags may become out of date, since they can be modified @@ -354,6 +385,9 @@ func (i *inode) DecRef() { // Destroy implements kernfs.Inode. func (i *inode) Destroy() { + if i.wouldBlock { + fdnotifier.RemoveFD(int32(i.hostFD)) + } if err := unix.Close(i.hostFD); err != nil { log.Warningf("failed to close host fd %d: %v", i.hostFD, err) } @@ -385,7 +419,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F return nil, syserror.ENOTTY } - ep, err := newEndpoint(ctx, i.hostFD) + ep, err := newEndpoint(ctx, i.hostFD, &i.queue) if err != nil { return nil, err } @@ -396,7 +430,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that // we don't allow importing arbitrary file types without proper support. if i.isTTY { - fd := &ttyFD{ + fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, termios: linux.DefaultSlaveTermios, } @@ -614,3 +648,20 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. return syserror.ENODEV } + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.inode.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileDescription) EventUnregister(e *waiter.Entry) { + f.inode.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) +} diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 39dd624a0..38f1fbfba 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -34,17 +34,16 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// Create a new host-backed endpoint from the given fd. -func newEndpoint(ctx context.Context, hostFD int) (transport.Endpoint, error) { +// Create a new host-backed endpoint from the given fd and its corresponding +// notification queue. +func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { // Set up an external transport.Endpoint using the host fd. addr := fmt.Sprintf("hostfd:[%d]", hostFD) - var q waiter.Queue - e, err := NewConnectedEndpoint(ctx, hostFD, &q, addr, true /* saveable */) + e, err := NewConnectedEndpoint(ctx, hostFD, addr, true /* saveable */) if err != nil { return nil, err.ToError() } - e.Init() - ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e) return ep, nil } @@ -77,8 +76,6 @@ type ConnectedEndpoint struct { // addr is the address at which this endpoint is bound. addr string - queue *waiter.Queue - // sndbuf is the size of the send buffer. // // N.B. When this is smaller than the host size, we present it via @@ -134,11 +131,10 @@ func (c *ConnectedEndpoint) init() *syserr.Error { // The caller is responsible for calling Init(). Additionaly, Release needs to // be called twice because ConnectedEndpoint is both a transport.Receiver and // transport.ConnectedEndpoint. -func NewConnectedEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { +func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { e := ConnectedEndpoint{ - fd: hostFD, - addr: addr, - queue: queue, + fd: hostFD, + addr: addr, } if err := e.init(); err != nil { @@ -151,13 +147,6 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, return &e, nil } -// Init will do the initialization required without holding other locks. -func (c *ConnectedEndpoint) Init() { - if err := fdnotifier.AddFD(int32(c.fd), c.queue); err != nil { - panic(err) - } -} - // Send implements transport.ConnectedEndpoint.Send. func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() @@ -332,7 +321,6 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } func (c *ConnectedEndpoint) destroyLocked() { - fdnotifier.RemoveFD(int32(c.fd)) c.fd = -1 } @@ -350,14 +338,20 @@ func (c *ConnectedEndpoint) Release() { func (c *ConnectedEndpoint) CloseUnread() {} // SCMConnectedEndpoint represents an endpoint backed by a host fd that was -// passed through a gofer Unix socket. It is almost the same as -// ConnectedEndpoint, with the following differences: +// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the +// following differences: // - SCMConnectedEndpoint is not saveable, because the host cannot guarantee // the same descriptor number across S/R. -// - SCMConnectedEndpoint holds ownership of its fd and is responsible for -// closing it. +// - SCMConnectedEndpoint holds ownership of its fd and notification queue. type SCMConnectedEndpoint struct { ConnectedEndpoint + + queue *waiter.Queue +} + +// Init will do the initialization required without holding other locks. +func (e *SCMConnectedEndpoint) Init() error { + return fdnotifier.AddFD(int32(e.fd), e.queue) } // Release implements transport.ConnectedEndpoint.Release and @@ -368,6 +362,7 @@ func (e *SCMConnectedEndpoint) Release() { if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) } + fdnotifier.RemoveFD(int32(e.fd)) e.destroyLocked() e.mu.Unlock() }) @@ -380,11 +375,13 @@ func (e *SCMConnectedEndpoint) Release() { // be called twice because ConnectedEndpoint is both a transport.Receiver and // transport.ConnectedEndpoint. func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) { - e := SCMConnectedEndpoint{ConnectedEndpoint{ - fd: hostFD, - addr: addr, + e := SCMConnectedEndpoint{ + ConnectedEndpoint: ConnectedEndpoint{ + fd: hostFD, + addr: addr, + }, queue: queue, - }} + } if err := e.init(); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 8936afb06..68af6e5af 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -26,15 +26,15 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor -// that wraps a TTY FD. -type ttyFD struct { +// TTYFileDescription implements vfs.FileDescriptionImpl for a host file +// descriptor that wraps a TTY FD. +type TTYFileDescription struct { fileDescription // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // session is the session attached to this ttyFD. + // session is the session attached to this TTYFileDescription. session *kernel.Session // fgProcessGroup is the foreground process group that is currently @@ -48,7 +48,7 @@ type ttyFD struct { // InitForegroundProcessGroup sets the foreground process group and session for // the TTY. This should only be called once, after the foreground process group // has been created, but before it has started running. -func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { +func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { t.mu.Lock() defer t.mu.Unlock() if t.fgProcessGroup != nil { @@ -59,14 +59,14 @@ func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { } // ForegroundProcessGroup returns the foreground process for the TTY. -func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup { +func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { t.mu.Lock() defer t.mu.Unlock() return t.fgProcessGroup } // Release implements fs.FileOperations.Release. -func (t *ttyFD) Release() { +func (t *TTYFileDescription) Release() { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() @@ -78,7 +78,7 @@ func (t *ttyFD) Release() { // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. -func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { +func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -96,7 +96,7 @@ func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. -func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { +func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -111,7 +111,7 @@ func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadO } // PWrite implements vfs.FileDescriptionImpl. -func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { +func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -126,7 +126,7 @@ func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64 } // Write implements vfs.FileDescriptionImpl. -func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { +func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -141,7 +141,7 @@ func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.Writ } // Ioctl implements vfs.FileDescriptionImpl. -func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Ignore arg[0]. This is the real FD: fd := t.inode.hostFD ioctl := args[1].Uint64() @@ -321,7 +321,7 @@ func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgum // is a bit convoluted, but documented inline. // // Preconditions: t.mu must be held. -func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error { +func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error { task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index dd5806301..8284e76a7 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -36,13 +37,20 @@ import ( // inode. // // Must be initialize with Init before first use. +// +// Lock ordering: mu => children.mu. type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl vfsfd vfs.FileDescription children *OrderedChildren - off int64 + + // mu protects the fields below. + mu sync.Mutex + + // off is the current directory offset. Protected by "mu". + off int64 } // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its @@ -115,17 +123,13 @@ func (fd *GenericDirectoryFD) inode() Inode { // IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - vfsFS := fd.filesystem() - fs := vfsFS.Impl().(*Filesystem) - vfsd := fd.vfsfd.VirtualDentry().Dentry() - - fs.mu.Lock() - defer fs.mu.Unlock() + fd.mu.Lock() + defer fd.mu.Unlock() opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat, err := fd.inode().Stat(vfsFS, opts) + stat, err := fd.inode().Stat(fd.filesystem(), opts) if err != nil { return err } @@ -143,8 +147,9 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { + vfsd := fd.vfsfd.VirtualDentry().Dentry() parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode - stat, err := parentInode.Stat(vfsFS, opts) + stat, err := parentInode.Stat(fd.filesystem(), opts) if err != nil { return err } @@ -168,7 +173,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(vfsFS, opts) + stat, err := inode.Stat(fd.filesystem(), opts) if err != nil { return err } @@ -192,9 +197,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Seek implements vfs.FileDecriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem().Impl().(*Filesystem) - fs.mu.Lock() - defer fs.mu.Unlock() + fd.mu.Lock() + defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 9e8d80414..4a12ae245 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -766,14 +766,17 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs() if err != nil { return nil, err } + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 732837933..a83151ad3 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -132,13 +132,6 @@ func (fs *Filesystem) processDeferredDecRefsLocked() { fs.droppedDentriesMu.Unlock() } -// Init initializes a kernfs filesystem. This should be called from during -// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding -// kernfs. -func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) { - fs.vfsfs.Init(vfsObj, fsType, fs) -} - // VFSFilesystem returns the generic vfs filesystem object. func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { return &fs.vfsfs @@ -261,6 +254,11 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) { d.children[name] = child } +// Inode returns the dentry's inode. +func (d *Dentry) Inode() Inode { + return d.inode +} + // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index a9f671bc8..1c5d3e7e7 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -195,7 +195,7 @@ func (fsType) Name() string { func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Init(vfsObj, &fst) + fs.VFSFilesystem().Init(vfsObj, &fst, fs) root := fst.rootFn(creds, fs) return fs.VFSFilesystem(), root.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index d6bd67467..5375e5e75 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -40,25 +40,19 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem - - // TODO(gvisor.dev/issue/1193): - // - // - kernfs does not provide a way to implement statfs, from which we - // should indicate PIPEFS_MAGIC. - // - // - kernfs does not provide a way to override names for - // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic - // name fmt.Sprintf("pipe:[%d]", inode.ino). -} +// TODO(gvisor.dev/issue/1193): +// +// - kernfs does not provide a way to implement statfs, from which we +// should indicate PIPEFS_MAGIC. +// +// - kernfs does not provide a way to override names for +// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic +// name fmt.Sprintf("pipe:[%d]", inode.ino). -// NewFilesystem sets up and returns a new vfs.Filesystem implemented by -// pipefs. +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) + fs := &kernfs.Filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.VFSFilesystem() } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6595fcee6..9c329341a 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -511,6 +511,8 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops \n") + for _, se := range d.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 92007df81..e5f13b69e 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -272,12 +272,16 @@ func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { inactiveFile := file - activeFile fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. - fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree/1024) + fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index f08668ca2..0e90e02fe 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -112,9 +112,7 @@ func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents), - }) + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 271134af8..239a9f4b4 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -41,26 +41,19 @@ func (filesystemType) Name() string { return "sockfs" } -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem -} - // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) + fs := &kernfs.Filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.VFSFilesystem() } // inode implements kernfs.Inode. // -// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are -// not included in InodeAttrs) to store the numbers of the appropriate -// socket device. Override InodeAttrs.Stat() accordingly. +// TODO(gvisor.dev/issue/1193): Device numbers. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index f8d25d35e..00f7d6214 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -47,7 +47,7 @@ func (FilesystemType) Name() string { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Filesystem.Init(vfsObj, &fsType) + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() defaultSysDirMode := linux.FileMode(0755) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5b62f9ebb..36ffcb592 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -697,13 +697,16 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(rp) if err != nil { return nil, err } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } switch impl := d.inode.impl.(type) { case *socketFile: return impl.ep, nil diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 84156d5a1..413111faf 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -29,7 +29,7 @@ import ( // // The types below create fast lookup slices for all syscalls. This maximum // serves as a sanity check that we don't allocate huge slices for a very large -// syscall. +// syscall. This is checked during registration. const maxSyscallNum = 2000 // SyscallSupportLevel is a syscall support levels. @@ -266,6 +266,16 @@ type SyscallTable struct { FeatureEnable SyscallFlagsTable } +// MaxSysno returns the largest system call number. +func (s *SyscallTable) MaxSysno() (max uintptr) { + for num := range s.Table { + if num > max { + max = num + } + } + return max +} + // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable @@ -286,6 +296,20 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { + if max := s.MaxSysno(); max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + allSyscallTables = append(allSyscallTables, s) + s.Init() +} + +// Init initializes the system call table. +// +// This should normally be called only during registration. +func (s *SyscallTable) Init() { if s.Table == nil { // Ensure non-nil lookup table. s.Table = make(map[uintptr]Syscall) @@ -295,42 +319,16 @@ func RegisterSyscallTable(s *SyscallTable) { s.Emulate = make(map[usermem.Addr]uintptr) } - var max uintptr - for num := range s.Table { - if num > max { - max = num - } - } - - if max > maxSyscallNum { - panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) - } - - s.lookup = make([]SyscallFn, max+1) + max := s.MaxSysno() // Checked during RegisterSyscallTable. // Initialize the fast-lookup table. + s.lookup = make([]SyscallFn, max+1) for num, sc := range s.Table { s.lookup[num] = sc.Fn } + // Initialize all features. s.FeatureEnable.init(s.Table, max) - - if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { - panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) - } - - // Save a reference to this table. - // - // This is required for a Kernel to find the table and for save/restore - // operations below. - allSyscallTables = append(allSyscallTables, s) -} - -// FlushSyscallTablesTestOnly flushes the syscall tables for tests. Used for -// parameterized VFSv2 tests. -// TODO(gvisor.dv/issue/1624): Remove when VFS1 is no longer supported. -func FlushSyscallTablesTestOnly() { - allSyscallTables = nil } // Lookup returns the syscall implementation, if one exists. diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index dc99301de..da0ea7bb5 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -16,6 +16,7 @@ package kernel import ( "fmt" + "sync/atomic" "time" "gvisor.dev/gvisor/pkg/log" @@ -48,6 +49,9 @@ type Timekeeper struct { // It is set only once, by SetClocks. monotonicOffset int64 `state:"nosave"` + // monotonicLowerBound is the lowerBound for monotonic time. + monotonicLowerBound int64 `state:"nosave"` + // restored, if non-nil, indicates that this Timekeeper was restored // from a state file. The clocks are not set until restored is closed. restored chan struct{} `state:"nosave"` @@ -271,6 +275,21 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { now, err := t.clocks.GetTime(c) if err == nil && c == sentrytime.Monotonic { now += t.monotonicOffset + for { + // It's possible that the clock is shaky. This may be due to + // platform issues, e.g. the KVM platform relies on the guest + // TSC and host TSC, which may not be perfectly in sync. To + // work around this issue, ensure that the monotonic time is + // always bounded by the last time read. + oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound) + if now < oldLowerBound { + now = oldLowerBound + break + } + if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) { + break + } + } } return now, err } diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 577e9306a..2b11ea4ae 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -180,6 +180,11 @@ type MemoryFileOpts struct { // notifications to determine when eviction is necessary. This option has // no effect unless DelayedEviction is DelayedEvictionEnabled. UseHostMemcgPressure bool + + // If ManualZeroing is true, MemoryFile must not assume that new pages + // obtained from the host are zero-filled, such that MemoryFile must manually + // zero newly-allocated pages. + ManualZeroing bool } // DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. @@ -432,6 +437,15 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Mark selected pages as in use. fr := platform.FileRange{start, end} + if f.opts.ManualZeroing { + if err := f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }); err != nil { + return platform.FileRange{}, err + } + } if !f.usage.Add(fr, usageInfo{ kind: kind, refs: 1, diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 023bad156..deedd35f7 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "socket_vfs2.go", "sockopt_impl.go", "stack.go", ], @@ -25,11 +26,15 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/hostfd", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 22f78d2e2..b49433326 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -49,6 +50,8 @@ const ( maxControlLen = 1024 ) +// LINT.IfChange + // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { @@ -59,23 +62,37 @@ type socketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout family int // Read-only. stype linux.SockType // Read-only. protocol int // Read-only. - fd int // must be O_NONBLOCK queue waiter.Queue + + // fd is the host socket fd. It must have O_NONBLOCK, so that operations + // will return EWOULDBLOCK instead of blocking on the host. This allows us to + // handle blocking behavior independently in the sentry. + fd int } var _ = socket.Socket(&socketOperations{}) func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { s := &socketOperations{ - family: family, - stype: stype, - protocol: protocol, - fd: fd, + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, } if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) @@ -86,28 +103,33 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc } // Release implements fs.FileOperations.Release. -func (s *socketOperations) Release() { +func (s *socketOpsCommon) Release() { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } // Readiness implements waiter.Waitable.Readiness. -func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(s.fd), mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.queue.EventRegister(e, mask) fdnotifier.UpdateFD(int32(s.fd)) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *socketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) fdnotifier.UpdateFD(int32(s.fd)) } +// Ioctl implements fs.FileOperations.Ioctl. +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, io, args) +} + // Read implements fs.FileOperations.Read. func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { @@ -155,7 +177,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } // Connect implements socket.Socket.Connect. -func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -195,7 +217,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr linux.SockAddr var peerAddrBuf []byte var peerAddrlen uint32 @@ -209,7 +231,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } // Conservatively ignore all flags specified by the application and add - // SOCK_NONBLOCK since socketOperations requires it. + // SOCK_NONBLOCK since socketOpsCommon requires it. fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) if blocking { var ch chan struct{} @@ -235,23 +257,41 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } - f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) - if err != nil { - syscall.Close(fd) - return 0, nil, 0, err - } - defer f.DecRef() + var ( + kfd int32 + kerr error + ) + if kernel.VFS2Enabled { + f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&syscall.SOCK_NONBLOCK)) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() - kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ - CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - }) - t.Kernel().RecordSocket(f) + kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocketVFS2(f) + } else { + f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocket(f) + } return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } // Bind implements socket.Socket.Bind. -func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -264,12 +304,12 @@ func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Listen implements socket.Socket.Listen. -func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(syscall.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. -func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { switch how { case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: return syserr.FromError(syscall.Shutdown(s.fd, how)) @@ -279,7 +319,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -328,7 +368,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. optlen := setSockOptLen(t, level, name) switch level { @@ -374,7 +414,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary @@ -496,7 +536,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument @@ -585,7 +625,7 @@ func translateIOSyscallError(err error) error { } // State implements socket.Socket.State. -func (s *socketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { info := linux.TCPInfo{} buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo) if err != nil { @@ -607,7 +647,7 @@ func (s *socketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.stype, s.protocol } @@ -663,8 +703,11 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int return nil, nil, nil } +// LINT.ThenChange(./socket_vfs2.go) + func init() { for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) } } diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index cd67234d2..3f420c2ec 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" @@ -54,12 +53,11 @@ func writev(fd int, srcs []syscall.Iovec) (uint64, error) { return uint64(n), nil } -// Ioctl implements fs.FileOperations.Ioctl. -func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := uintptr(args[1].Int()); cmd { case syscall.TIOCINQ, syscall.TIOCOUTQ: var val int32 - if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { return 0, translateIOSyscallError(errno) } var buf [4]byte @@ -93,7 +91,7 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { } // GetSockName implements socket.Socket.GetSockName. -func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) @@ -104,7 +102,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, } // GetPeerName implements socket.Socket.GetPeerName. -func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go new file mode 100644 index 000000000..a8278bffc --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -0,0 +1,186 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +type socketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + // We store metadata for hostinet sockets internally. Technically, we should + // access metadata (e.g. through stat, chmod) on the host for correctness, + // but this is not very useful for inet socket fds, which do not belong to a + // concrete file anyway. + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&socketVFS2{}) + +func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + + s := &socketVFS2{ + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, + } + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + return int64(n), err +} + +type socketProviderVFS2 struct { + family int +} + +// Socket implements socket.ProviderVFS2.Socket. +func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&syscall.SOCK_NONBLOCK)) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 878f81fd5..40736fb38 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -53,9 +53,14 @@ type metadata struct { Size uint32 } +// enableLogging controls whether to log the (de)serialization of netfilter +// structs between userspace and netstack. These logs are useful when +// developing iptables, but can pollute sentry logs otherwise. +const enableLogging = false + // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { - if log.IsLogging(log.Debug) { + if enableLogging && log.IsLogging(log.Debug) { log.Debugf("netfilter: "+format, args...) } } @@ -246,7 +251,7 @@ func marshalTarget(target stack.Target) []byte { case stack.ReturnTarget: return marshalStandardTarget(stack.RuleReturn) case stack.RedirectTarget: - return marshalRedirectTarget() + return marshalRedirectTarget(tg) case JumpTarget: return marshalJumpTarget(tg) default: @@ -283,7 +288,7 @@ func marshalErrorTarget(errorName string) []byte { return binary.Marshal(ret, usermem.ByteOrder, target) } -func marshalRedirectTarget() []byte { +func marshalRedirectTarget(rt stack.RedirectTarget) []byte { // This is a redirect target named redirect target := linux.XTRedirectTarget{ Target: linux.XTEntryTarget{ @@ -293,6 +298,16 @@ func marshalRedirectTarget() []byte { copy(target.Target.Name[:], redirectTargetName) ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) + target.NfRange.RangeSize = 1 + if rt.RangeProtoSpecified { + target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + } + // Convert port from little endian to big endian. + port := make([]byte, 2) + binary.LittleEndian.PutUint16(port, rt.MinPort) + target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port) + binary.LittleEndian.PutUint16(port, rt.MaxPort) + target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port) return binary.Marshal(ret, usermem.ByteOrder, target) } diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index c948de876..84abe8d29 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -15,6 +15,7 @@ package netfilter import ( + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -29,6 +30,6 @@ type JumpTarget struct { } // Action implements stack.Target.Action. -func (jt JumpTarget) Action(stack.PacketBuffer) (stack.RuleVerdict, int) { +func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index ff1cfd8f6..57a1e1c12 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -120,13 +120,27 @@ func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa if pkt.TransportHeader != nil { tcpHeader = header.TCP(pkt.TransportHeader) } else { + var length int + if hook == stack.Prerouting { + // The network header hasn't been parsed yet. We have to do it here. + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + // There's no valid TCP header here, so we hotdrop the + // packet. + return false, true + } + h := header.IPv4(hdr) + pkt.NetworkHeader = hdr + length = int(h.HeaderLength()) + } // The TCP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.TCPMinimumSize { + hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize) + if !ok { // There's no valid TCP header here, so we hotdrop the // packet. return false, true } - tcpHeader = header.TCP(pkt.Data.First()) + tcpHeader = header.TCP(hdr[length:]) } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index 3359418c1..cfa9e621d 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -119,13 +119,27 @@ func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceNa if pkt.TransportHeader != nil { udpHeader = header.UDP(pkt.TransportHeader) } else { + var length int + if hook == stack.Prerouting { + // The network header hasn't been parsed yet. We have to do it here. + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + // There's no valid UDP header here, so we hotdrop the + // packet. + return false, true + } + h := header.IPv4(hdr) + pkt.NetworkHeader = hdr + length = int(h.HeaderLength()) + } // The UDP header hasn't been parsed yet. We have to do it here. - if len(pkt.Data.First()) < header.UDPMinimumSize { + hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize) + if !ok { // There's no valid UDP header here, so we hotdrop the // packet. return false, true } - udpHeader = header.UDP(pkt.Data.First()) + udpHeader = header.UDP(hdr[length:]) } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 1911cd9b8..09ca00a4a 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "message.go", "provider.go", + "provider_vfs2.go", "socket.go", + "socket_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -18,6 +20,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -25,6 +29,7 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index b0dc70e5c..0d45e5053 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -67,6 +67,8 @@ func RegisterProvider(protocol int, provider Provider) { protocols[protocol] = provider } +// LINT.IfChange + // socketProvider implements socket.Provider. type socketProvider struct { } @@ -105,7 +107,10 @@ func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.Fi return nil, nil, syserr.ErrNotSupported } +// LINT.ThenChange(./provider_vfs2.go) + // init registers the socket provider. func init() { socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) + socket.RegisterProviderVFS2(linux.AF_NETLINK, &socketProviderVFS2{}) } diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go new file mode 100644 index 000000000..dcd92b5cd --- /dev/null +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -0,0 +1,71 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" +) + +// socketProviderVFS2 implements socket.Provider. +type socketProviderVFS2 struct { +} + +// Socket implements socket.Provider.Socket. +func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewVFS2(t, stype, p) + if err != nil { + return nil, err + } + + vfsfd := &s.vfsfd + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2ca02567d..81f34c5a2 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -58,6 +58,8 @@ var errNoFilter = syserr.New("no filter attached", linux.ENOENT) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() +// LINT.IfChange + // Socket is the base socket type for netlink sockets. // // This implementation only supports userspace sending and receiving messages @@ -74,6 +76,14 @@ type Socket struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout // ports provides netlink port allocation. @@ -140,17 +150,19 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } return &Socket{ - ports: t.Kernel().NetlinkPorts(), - protocol: protocol, - skType: skType, - ep: ep, - connection: connection, - sendBufferSize: defaultSendBufferSize, + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, }, nil } // Release implements fs.FileOperations.Release. -func (s *Socket) Release() { +func (s *socketOpsCommon) Release() { s.connection.Release() s.ep.Close() @@ -160,7 +172,7 @@ func (s *Socket) Release() { } // Readiness implements waiter.Waitable.Readiness. -func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { // ep holds messages to be read and thus handles EventIn readiness. ready := s.ep.Readiness(mask) @@ -174,18 +186,18 @@ func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { } // EventRegister implements waiter.Waitable.EventRegister. -func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) // Writable readiness never changes, so no registration is needed. } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *Socket) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // Passcred implements transport.Credentialer.Passcred. -func (s *Socket) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { s.mu.Lock() passcred := s.passcred s.mu.Unlock() @@ -193,7 +205,7 @@ func (s *Socket) Passcred() bool { } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *Socket) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { // This socket is connected to the kernel, which doesn't need creds. // // This is arbitrary, as ConnectedPasscred on this type has no callers. @@ -227,7 +239,7 @@ func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { // port of 0 defaults to the ThreadGroup ID. // // Preconditions: mu is held. -func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { +func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error { if s.bound { // Re-binding is only allowed if the port doesn't change. if port != s.portID { @@ -251,7 +263,7 @@ func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { } // Bind implements socket.Socket.Bind. -func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -269,7 +281,7 @@ func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Connect implements socket.Socket.Connect. -func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -300,25 +312,25 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } // Listen implements socket.Socket.Listen. -func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // Netlink sockets never support listen. return syserr.ErrNotSupported } // Shutdown implements socket.Socket.Shutdown. -func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { // Netlink sockets never support shutdown. return syserr.ErrNotSupported } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { @@ -369,7 +381,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem. } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: switch name { @@ -466,7 +478,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } // GetSockName implements socket.Socket.GetSockName. -func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { s.mu.Lock() defer s.mu.Unlock() @@ -478,7 +490,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // GetPeerName implements socket.Socket.GetPeerName. -func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { sa := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, // TODO(b/68878065): Support non-kernel peers. For now the peer @@ -489,7 +501,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // RecvMsg implements socket.Socket.RecvMsg. -func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { from := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -590,7 +602,7 @@ func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) var kernelCreds = &kernelSCM{} // sendResponse sends the response messages in ms back to userspace. -func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { +func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { // Linux combines multiple netlink messages into a single datagram. bufs := make([][]byte, 0, len(ms.Messages)) for _, m := range ms.Messages { @@ -666,7 +678,7 @@ func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { // processMessages handles each message in buf, passing it to the protocol // handler for final handling. -func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { +func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { msg, rest, ok := ParseMessage(buf) if !ok { @@ -698,7 +710,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } // sendMsg is the core of message send, used for SendMsg and Write. -func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { @@ -745,7 +757,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. -func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } @@ -756,11 +768,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, } // State implements socket.Socket.State. -func (s *Socket) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. -func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return linux.AF_NETLINK, s.skType, s.protocol.Protocol() } + +// LINT.ThenChange(./socket_vfs2.go) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go new file mode 100644 index 000000000..b854bf990 --- /dev/null +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -0,0 +1,138 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 is the base VFS2 socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ socket.SocketVFS2 = (*SocketVFS2)(nil) +var _ transport.Credentialer = (*SocketVFS2)(nil) + +// NewVFS2 creates a new SocketVFS2. +func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless(t) + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. + connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) + if err != nil { + ep.Close() + return nil, err + } + + return &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, + }, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index cbf46b1e9..ccf9fcf5c 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "device.go", "netstack.go", + "netstack_vfs2.go", "provider.go", + "provider_vfs2.go", "save_restore.go", "stack.go", ], @@ -25,6 +27,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -32,6 +36,7 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index d5879c10f..81053d8ef 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -252,6 +252,8 @@ type commonEndpoint interface { GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) } +// LINT.IfChange + // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. // @@ -263,6 +265,14 @@ type SocketOperations struct { fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout *waiter.Queue @@ -314,11 +324,13 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue dirent := socket.NewDirent(t, netstackDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ - Queue: queue, - family: family, - Endpoint: endpoint, - skType: skType, - protocol: protocol, + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, }), nil } @@ -417,7 +429,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { } } -func (s *SocketOperations) isPacketBased() bool { +func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } @@ -425,7 +437,7 @@ func (s *SocketOperations) isPacketBased() bool { // empty. It assumes that the socket is locked. // // Precondition: s.readMu must be held. -func (s *SocketOperations) fetchReadView() *syserr.Error { +func (s *socketOpsCommon) fetchReadView() *syserr.Error { if len(s.readView) > 0 { return nil } @@ -446,7 +458,7 @@ func (s *SocketOperations) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *SocketOperations) Release() { +func (s *socketOpsCommon) Release() { s.Endpoint.Close() } @@ -633,7 +645,7 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } // Readiness returns a mask of ready events for socket s. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { r := s.Endpoint.Readiness(mask) // Check our cached value iff the caller asked for readability and the @@ -647,7 +659,7 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { return r } -func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error { +func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { if family == uint16(s.family) { return nil } @@ -670,7 +682,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error // represented by the empty string. // // TODO(gvisor.dev/issue/1556): remove this function. -func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { +func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" } @@ -679,7 +691,7 @@ func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpi // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { addr, family, err := AddressAndFamily(sockaddr) if err != nil { return err @@ -725,7 +737,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } @@ -771,13 +783,13 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { +func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -863,7 +875,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err @@ -2258,7 +2270,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2270,7 +2282,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2285,7 +2297,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // caller. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { +func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { var err *syserr.Error var copied int @@ -2337,7 +2349,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq return 0, err } -func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { +func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { if !s.sockOptInq { return } @@ -2352,7 +2364,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. -func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -2461,7 +2473,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe return n, flags, addr, addrLen, cmsg, syserr.FromError(err) } -func (s *SocketOperations) controlMessages() socket.ControlMessages { +func (s *socketOpsCommon) controlMessages() socket.ControlMessages { return socket.ControlMessages{ IP: tcpip.ControlMessages{ HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, @@ -2480,7 +2492,7 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages { // successfully writing packet data out to userspace. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) updateTimestamp() { +func (s *socketOpsCommon) updateTimestamp() { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true @@ -2490,7 +2502,7 @@ func (s *SocketOperations) updateTimestamp() { // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -2558,7 +2570,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument @@ -2634,6 +2646,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, io, args) +} + +func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. @@ -2973,7 +2989,7 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { if s.family != linux.AF_INET && s.family != linux.AF_INET6 { // States not implemented for this socket's family. return 0 @@ -3033,6 +3049,8 @@ func (s *SocketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.skType, s.protocol } + +// LINT.ThenChange(./netstack_vfs2.go) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go new file mode 100644 index 000000000..f7d9b2ff4 --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -0,0 +1,330 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&SocketVFS2{}) + +// NewVFS2 creates a new endpoint socket. +func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { + if skType == linux.SOCK_STREAM { + if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + } + + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + + s := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + f := &ioSequencePayload{ctx: ctx, src: src} + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + if err == tcpip.ErrWouldBlock { + return 0, syserror.ErrWouldBlock + } + + if resCh != nil { + t := kernel.TaskFromContext(ctx) + if err := t.Block(resCh); err != nil { + return 0, syserr.FromError(err).ToError() + } + + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) + } + + if err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if int64(n) < src.NumBytes() { + return int64(n), syserror.ErrWouldBlock + } + + return int64(n), nil +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + + t.Kernel().RecordSocketVFS2(ns) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, uio, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) + if err != nil { + return nil, err + } + return info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return entries, nil + + } + } + + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + } + } + + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) +} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index c3f04b613..ead3b2b79 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -33,6 +33,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // provider is an inet socket provider. type provider struct { family int @@ -167,6 +169,8 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol return New(t, linux.AF_PACKET, stype, protocol, wq, ep) } +// LINT.ThenChange(./provider_vfs2.go) + // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go new file mode 100644 index 000000000..2a01143f6 --- /dev/null +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -0,0 +1,141 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/waiter" +) + +// providerVFS2 is an inet socket provider. +type providerVFS2 struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. +func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Fail right away if we don't have a stack. + stack := t.NetworkContext() + if stack == nil { + // Don't propagate an error here. Instead, allow the socket + // code to continue searching for another provider. + return nil, nil + } + eps, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocketVFS2(t, eps, stype, protocol) + } + + // Figure out the transport protocol. + transProto, associated, err := getTransportProtocol(t, stype, protocol) + if err != nil { + return nil, err + } + + // Create the endpoint. + var ep tcpip.Endpoint + var e *tcpip.Error + wq := &waiter.Queue{} + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } + } + if e != nil { + return nil, syserr.TranslateNetstackError(e) + } + + return NewVFS2(t, p.family, stype, int(transProto), wq, ep) +} + +func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + +// Pair just returns nil sockets (not supported). +func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + return nil, nil, nil +} + +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. +func init() { + // Providers backed by netstack. + p := []providerVFS2{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + { + family: linux.AF_PACKET, + }, + } + + for i := range p { + socket.RegisterProviderVFS2(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 7c64f30fa..5b29e9d7f 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -103,7 +103,7 @@ func (s *socketOpsCommon) DecRef() { } // Release implemements fs.FileOperations.Release. -func (s *SocketOperations) Release() { +func (s *socketOpsCommon) Release() { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. s.DecRef() @@ -323,7 +323,10 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Create the socket. // - // TODO(gvisor.dev/issue/2324): Correctly set file permissions. + // Note that the file permissions here are not set correctly (see + // gvisor.dev/issue/2324). There is no convenient way to get permissions + // on the socket referred to by s, so we will leave this discrepancy + // unresolved until VFS2 replaces this code. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse @@ -373,7 +376,7 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, Path: p, FollowFinalSymlink: true, } - ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop) + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) root.DecRef() if relPath { start.DecRef() diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 315dc274f..06d838868 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -43,6 +44,8 @@ type SocketVFS2 struct { socketOpsCommon } +var _ = socket.SocketVFS2(&SocketVFS2{}) + // NewSockfsFile creates a new socket file in the global sockfs mount and // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { @@ -197,11 +200,13 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { Start: start, Path: path, } - err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ - // TODO(gvisor.dev/issue/2324): The file permissions should be taken - // from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind), - // but VFS1 just always uses 0400. Resolve this inconsistency. - Mode: linux.S_IFSOCK | 0400, + stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) + if err != nil { + return syserr.FromError(err) + } + err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // File permissions correspond to net/unix/af_unix.c:unix_bind. + Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), Endpoint: bep, }) if err == syserror.EEXIST { @@ -227,7 +232,7 @@ func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset i // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. - // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } @@ -252,7 +257,7 @@ func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. - // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } @@ -273,13 +278,6 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs }) } -// Release implements vfs.FileDescriptionImpl. -func (s *SocketVFS2) Release() { - // Release only decrements a reference on s because s may be referenced in - // the abstract socket namespace. - s.DecRef() -} - // Readiness implements waiter.Waitable.Readiness. func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 0d24fd3c4..245e8fe1e 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -8,8 +8,6 @@ go_library( "error.go", "flags.go", "linux64.go", - "linux64_amd64.go", - "linux64_arm64.go", "sigset.go", "sys_aio.go", "sys_capability.go", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 68589a377..ea4f9b1a7 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -15,6 +15,16 @@ // Package linux provides syscall tables for amd64 Linux. package linux +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + const ( // LinuxSysname is the OS name advertised by gVisor. LinuxSysname = "Linux" @@ -25,3 +35,702 @@ const ( // LinuxVersion is the version info advertised by gVisor. LinuxVersion = "#1 SMP Sun Jan 10 15:06:54 PST 2016" ) + +// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var AMD64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.AMD64, + Version: kernel.Version{ + // Version 4.4 is chosen as a stable, longterm version of Linux, which + // guides the interface provided by this syscall table. The build + // version is that for a clean build with default kernel config, at 5 + // minutes after v4.4 was tagged. + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_X86_64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.Supported("read", Read), + 1: syscalls.Supported("write", Write), + 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), + 3: syscalls.Supported("close", Close), + 4: syscalls.Supported("stat", Stat), + 5: syscalls.Supported("fstat", Fstat), + 6: syscalls.Supported("lstat", Lstat), + 7: syscalls.Supported("poll", Poll), + 8: syscalls.Supported("lseek", Lseek), + 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 10: syscalls.Supported("mprotect", Mprotect), + 11: syscalls.Supported("munmap", Munmap), + 12: syscalls.Supported("brk", Brk), + 13: syscalls.Supported("rt_sigaction", RtSigaction), + 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 15: syscalls.Supported("rt_sigreturn", RtSigreturn), + 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 17: syscalls.Supported("pread64", Pread64), + 18: syscalls.Supported("pwrite64", Pwrite64), + 19: syscalls.Supported("readv", Readv), + 20: syscalls.Supported("writev", Writev), + 21: syscalls.Supported("access", Access), + 22: syscalls.Supported("pipe", Pipe), + 23: syscalls.Supported("select", Select), + 24: syscalls.Supported("sched_yield", SchedYield), + 25: syscalls.Supported("mremap", Mremap), + 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 32: syscalls.Supported("dup", Dup), + 33: syscalls.Supported("dup2", Dup2), + 34: syscalls.Supported("pause", Pause), + 35: syscalls.Supported("nanosleep", Nanosleep), + 36: syscalls.Supported("getitimer", Getitimer), + 37: syscalls.Supported("alarm", Alarm), + 38: syscalls.Supported("setitimer", Setitimer), + 39: syscalls.Supported("getpid", Getpid), + 40: syscalls.Supported("sendfile", Sendfile), + 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 42: syscalls.Supported("connect", Connect), + 43: syscalls.Supported("accept", Accept), + 44: syscalls.Supported("sendto", SendTo), + 45: syscalls.Supported("recvfrom", RecvFrom), + 46: syscalls.Supported("sendmsg", SendMsg), + 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 50: syscalls.Supported("listen", Listen), + 51: syscalls.Supported("getsockname", GetSockName), + 52: syscalls.Supported("getpeername", GetPeerName), + 53: syscalls.Supported("socketpair", SocketPair), + 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 57: syscalls.Supported("fork", Fork), + 58: syscalls.Supported("vfork", Vfork), + 59: syscalls.Supported("execve", Execve), + 60: syscalls.Supported("exit", Exit), + 61: syscalls.Supported("wait4", Wait4), + 62: syscalls.Supported("kill", Kill), + 63: syscalls.Supported("uname", Uname), + 64: syscalls.Supported("semget", Semget), + 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 67: syscalls.Supported("shmdt", Shmdt), + 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 76: syscalls.Supported("truncate", Truncate), + 77: syscalls.Supported("ftruncate", Ftruncate), + 78: syscalls.Supported("getdents", Getdents), + 79: syscalls.Supported("getcwd", Getcwd), + 80: syscalls.Supported("chdir", Chdir), + 81: syscalls.Supported("fchdir", Fchdir), + 82: syscalls.Supported("rename", Rename), + 83: syscalls.Supported("mkdir", Mkdir), + 84: syscalls.Supported("rmdir", Rmdir), + 85: syscalls.Supported("creat", Creat), + 86: syscalls.Supported("link", Link), + 87: syscalls.Supported("unlink", Unlink), + 88: syscalls.Supported("symlink", Symlink), + 89: syscalls.Supported("readlink", Readlink), + 90: syscalls.Supported("chmod", Chmod), + 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 92: syscalls.Supported("chown", Chown), + 93: syscalls.Supported("fchown", Fchown), + 94: syscalls.Supported("lchown", Lchown), + 95: syscalls.Supported("umask", Umask), + 96: syscalls.Supported("gettimeofday", Gettimeofday), + 97: syscalls.Supported("getrlimit", Getrlimit), + 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 100: syscalls.Supported("times", Times), + 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 102: syscalls.Supported("getuid", Getuid), + 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 104: syscalls.Supported("getgid", Getgid), + 105: syscalls.Supported("setuid", Setuid), + 106: syscalls.Supported("setgid", Setgid), + 107: syscalls.Supported("geteuid", Geteuid), + 108: syscalls.Supported("getegid", Getegid), + 109: syscalls.Supported("setpgid", Setpgid), + 110: syscalls.Supported("getppid", Getppid), + 111: syscalls.Supported("getpgrp", Getpgrp), + 112: syscalls.Supported("setsid", Setsid), + 113: syscalls.Supported("setreuid", Setreuid), + 114: syscalls.Supported("setregid", Setregid), + 115: syscalls.Supported("getgroups", Getgroups), + 116: syscalls.Supported("setgroups", Setgroups), + 117: syscalls.Supported("setresuid", Setresuid), + 118: syscalls.Supported("getresuid", Getresuid), + 119: syscalls.Supported("setresgid", Setresgid), + 120: syscalls.Supported("getresgid", Getresgid), + 121: syscalls.Supported("getpgid", Getpgid), + 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 124: syscalls.Supported("getsid", Getsid), + 125: syscalls.Supported("capget", Capget), + 126: syscalls.Supported("capset", Capset), + 127: syscalls.Supported("rt_sigpending", RtSigpending), + 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 131: syscalls.Supported("sigaltstack", Sigaltstack), + 132: syscalls.Supported("utime", Utime), + 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), + 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), + 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), + 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), + 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), + 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), + 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), + 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 161: syscalls.Supported("chroot", Chroot), + 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 170: syscalls.Supported("sethostname", Sethostname), + 171: syscalls.Supported("setdomainname", Setdomainname), + 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), + 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), + 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), + 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), + 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), + 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), + 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), + 186: syscalls.Supported("gettid", Gettid), + 187: syscalls.Supported("readahead", Readahead), + 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 200: syscalls.Supported("tkill", Tkill), + 201: syscalls.Supported("time", Time), + 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), + 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 213: syscalls.Supported("epoll_create", EpollCreate), + 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), + 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), + 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 217: syscalls.Supported("getdents64", Getdents64), + 218: syscalls.Supported("set_tid_address", SetTidAddress), + 219: syscalls.Supported("restart_syscall", RestartSyscall), + 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 222: syscalls.Supported("timer_create", TimerCreate), + 223: syscalls.Supported("timer_settime", TimerSettime), + 224: syscalls.Supported("timer_gettime", TimerGettime), + 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 226: syscalls.Supported("timer_delete", TimerDelete), + 227: syscalls.Supported("clock_settime", ClockSettime), + 228: syscalls.Supported("clock_gettime", ClockGettime), + 229: syscalls.Supported("clock_getres", ClockGetres), + 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 231: syscalls.Supported("exit_group", ExitGroup), + 232: syscalls.Supported("epoll_wait", EpollWait), + 233: syscalls.Supported("epoll_ctl", EpollCtl), + 234: syscalls.Supported("tgkill", Tgkill), + 235: syscalls.Supported("utimes", Utimes), + 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), + 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 247: syscalls.Supported("waitid", Waitid), + 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 257: syscalls.Supported("openat", Openat), + 258: syscalls.Supported("mkdirat", Mkdirat), + 259: syscalls.Supported("mknodat", Mknodat), + 260: syscalls.Supported("fchownat", Fchownat), + 261: syscalls.Supported("futimesat", Futimesat), + 262: syscalls.Supported("fstatat", Fstatat), + 263: syscalls.Supported("unlinkat", Unlinkat), + 264: syscalls.Supported("renameat", Renameat), + 265: syscalls.Supported("linkat", Linkat), + 266: syscalls.Supported("symlinkat", Symlinkat), + 267: syscalls.Supported("readlinkat", Readlinkat), + 268: syscalls.Supported("fchmodat", Fchmodat), + 269: syscalls.Supported("faccessat", Faccessat), + 270: syscalls.Supported("pselect", Pselect), + 271: syscalls.Supported("ppoll", Ppoll), + 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 275: syscalls.Supported("splice", Splice), + 276: syscalls.Supported("tee", Tee), + 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 280: syscalls.Supported("utimensat", Utimensat), + 281: syscalls.Supported("epoll_pwait", EpollPwait), + 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 283: syscalls.Supported("timerfd_create", TimerfdCreate), + 284: syscalls.Supported("eventfd", Eventfd), + 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 286: syscalls.Supported("timerfd_settime", TimerfdSettime), + 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 288: syscalls.Supported("accept4", Accept4), + 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 290: syscalls.Supported("eventfd2", Eventfd2), + 291: syscalls.Supported("epoll_create1", EpollCreate1), + 292: syscalls.Supported("dup3", Dup3), + 293: syscalls.Supported("pipe2", Pipe2), + 294: syscalls.Supported("inotify_init1", InotifyInit1), + 295: syscalls.Supported("preadv", Preadv), + 296: syscalls.Supported("pwritev", Pwritev), + 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 302: syscalls.Supported("prlimit64", Prlimit64), + 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 309: syscalls.Supported("getcpu", Getcpu), + 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 317: syscalls.Supported("seccomp", Seccomp), + 318: syscalls.Supported("getrandom", GetRandom), + 319: syscalls.Supported("memfd_create", MemfdCreate), + 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), + 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 322: syscalls.Supported("execveat", Execveat), + 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls implemented after 325 are "backports" from versions + // of Linux after 4.4. + 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 327: syscalls.Supported("preadv2", Preadv2), + 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 332: syscalls.Supported("statx", Statx), + 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{ + 0xffffffffff600000: 96, // vsyscall gettimeofday(2) + 0xffffffffff600400: 201, // vsyscall time(2) + 0xffffffffff600800: 309, // vsyscall getcpu(2) + }, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall +// numbers from Linux 4.4. +var ARM64 = &kernel.SyscallTable{ + OS: abi.Linux, + Arch: arch.ARM64, + Version: kernel.Version{ + Sysname: LinuxSysname, + Release: LinuxRelease, + Version: LinuxVersion, + }, + AuditNumber: linux.AUDIT_ARCH_AARCH64, + Table: map[uintptr]kernel.Syscall{ + 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), + 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), + 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), + 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), + 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), + 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), + 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), + 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), + 17: syscalls.Supported("getcwd", Getcwd), + 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), + 19: syscalls.Supported("eventfd2", Eventfd2), + 20: syscalls.Supported("epoll_create1", EpollCreate1), + 21: syscalls.Supported("epoll_ctl", EpollCtl), + 22: syscalls.Supported("epoll_pwait", EpollPwait), + 23: syscalls.Supported("dup", Dup), + 24: syscalls.Supported("dup3", Dup3), + 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), + 26: syscalls.Supported("inotify_init1", InotifyInit1), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), + 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) + 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), + 33: syscalls.Supported("mknodat", Mknodat), + 34: syscalls.Supported("mkdirat", Mkdirat), + 35: syscalls.Supported("unlinkat", Unlinkat), + 36: syscalls.Supported("symlinkat", Symlinkat), + 37: syscalls.Supported("linkat", Linkat), + 38: syscalls.Supported("renameat", Renameat), + 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), + 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), + 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), + 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), + 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), + 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), + 45: syscalls.Supported("truncate", Truncate), + 46: syscalls.Supported("ftruncate", Ftruncate), + 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), + 48: syscalls.Supported("faccessat", Faccessat), + 49: syscalls.Supported("chdir", Chdir), + 50: syscalls.Supported("fchdir", Fchdir), + 51: syscalls.Supported("chroot", Chroot), + 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), + 53: syscalls.Supported("fchmodat", Fchmodat), + 54: syscalls.Supported("fchownat", Fchownat), + 55: syscalls.Supported("fchown", Fchown), + 56: syscalls.Supported("openat", Openat), + 57: syscalls.Supported("close", Close), + 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), + 59: syscalls.Supported("pipe2", Pipe2), + 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations + 61: syscalls.Supported("getdents64", Getdents64), + 62: syscalls.Supported("lseek", Lseek), + 63: syscalls.Supported("read", Read), + 64: syscalls.Supported("write", Write), + 65: syscalls.Supported("readv", Readv), + 66: syscalls.Supported("writev", Writev), + 67: syscalls.Supported("pread64", Pread64), + 68: syscalls.Supported("pwrite64", Pwrite64), + 69: syscalls.Supported("preadv", Preadv), + 70: syscalls.Supported("pwritev", Pwritev), + 71: syscalls.Supported("sendfile", Sendfile), + 72: syscalls.Supported("pselect", Pselect), + 73: syscalls.Supported("ppoll", Ppoll), + 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), + 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) + 77: syscalls.Supported("tee", Tee), + 78: syscalls.Supported("readlinkat", Readlinkat), + 79: syscalls.Supported("fstatat", Fstatat), + 80: syscalls.Supported("fstat", Fstat), + 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), + 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), + 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), + 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), + 85: syscalls.Supported("timerfd_create", TimerfdCreate), + 86: syscalls.Supported("timerfd_settime", TimerfdSettime), + 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), + 88: syscalls.Supported("utimensat", Utimensat), + 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), + 90: syscalls.Supported("capget", Capget), + 91: syscalls.Supported("capset", Capset), + 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), + 93: syscalls.Supported("exit", Exit), + 94: syscalls.Supported("exit_group", ExitGroup), + 95: syscalls.Supported("waitid", Waitid), + 96: syscalls.Supported("set_tid_address", SetTidAddress), + 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), + 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), + 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 101: syscalls.Supported("nanosleep", Nanosleep), + 102: syscalls.Supported("getitimer", Getitimer), + 103: syscalls.Supported("setitimer", Setitimer), + 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), + 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), + 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), + 107: syscalls.Supported("timer_create", TimerCreate), + 108: syscalls.Supported("timer_gettime", TimerGettime), + 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), + 110: syscalls.Supported("timer_settime", TimerSettime), + 111: syscalls.Supported("timer_delete", TimerDelete), + 112: syscalls.Supported("clock_settime", ClockSettime), + 113: syscalls.Supported("clock_gettime", ClockGettime), + 114: syscalls.Supported("clock_getres", ClockGetres), + 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), + 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), + 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), + 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), + 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), + 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), + 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), + 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), + 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), + 124: syscalls.Supported("sched_yield", SchedYield), + 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), + 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), + 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), + 128: syscalls.Supported("restart_syscall", RestartSyscall), + 129: syscalls.Supported("kill", Kill), + 130: syscalls.Supported("tkill", Tkill), + 131: syscalls.Supported("tgkill", Tgkill), + 132: syscalls.Supported("sigaltstack", Sigaltstack), + 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), + 134: syscalls.Supported("rt_sigaction", RtSigaction), + 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), + 136: syscalls.Supported("rt_sigpending", RtSigpending), + 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), + 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), + 139: syscalls.Supported("rt_sigreturn", RtSigreturn), + 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), + 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), + 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), + 143: syscalls.Supported("setregid", Setregid), + 144: syscalls.Supported("setgid", Setgid), + 145: syscalls.Supported("setreuid", Setreuid), + 146: syscalls.Supported("setuid", Setuid), + 147: syscalls.Supported("setresuid", Setresuid), + 148: syscalls.Supported("getresuid", Getresuid), + 149: syscalls.Supported("setresgid", Setresgid), + 150: syscalls.Supported("getresgid", Getresgid), + 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) + 153: syscalls.Supported("times", Times), + 154: syscalls.Supported("setpgid", Setpgid), + 155: syscalls.Supported("getpgid", Getpgid), + 156: syscalls.Supported("getsid", Getsid), + 157: syscalls.Supported("setsid", Setsid), + 158: syscalls.Supported("getgroups", Getgroups), + 159: syscalls.Supported("setgroups", Setgroups), + 160: syscalls.Supported("uname", Uname), + 161: syscalls.Supported("sethostname", Sethostname), + 162: syscalls.Supported("setdomainname", Setdomainname), + 163: syscalls.Supported("getrlimit", Getrlimit), + 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), + 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), + 166: syscalls.Supported("umask", Umask), + 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), + 168: syscalls.Supported("getcpu", Getcpu), + 169: syscalls.Supported("gettimeofday", Gettimeofday), + 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), + 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), + 172: syscalls.Supported("getpid", Getpid), + 173: syscalls.Supported("getppid", Getppid), + 174: syscalls.Supported("getuid", Getuid), + 175: syscalls.Supported("geteuid", Geteuid), + 176: syscalls.Supported("getgid", Getgid), + 177: syscalls.Supported("getegid", Getegid), + 178: syscalls.Supported("gettid", Gettid), + 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), + 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) + 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) + 190: syscalls.Supported("semget", Semget), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), + 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), + 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), + 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), + 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), + 197: syscalls.Supported("shmdt", Shmdt), + 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), + 199: syscalls.Supported("socketpair", SocketPair), + 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), + 201: syscalls.Supported("listen", Listen), + 202: syscalls.Supported("accept", Accept), + 203: syscalls.Supported("connect", Connect), + 204: syscalls.Supported("getsockname", GetSockName), + 205: syscalls.Supported("getpeername", GetPeerName), + 206: syscalls.Supported("sendto", SendTo), + 207: syscalls.Supported("recvfrom", RecvFrom), + 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), + 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), + 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), + 211: syscalls.Supported("sendmsg", SendMsg), + 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), + 213: syscalls.Supported("readahead", Readahead), + 214: syscalls.Supported("brk", Brk), + 215: syscalls.Supported("munmap", Munmap), + 216: syscalls.Supported("mremap", Mremap), + 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), + 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), + 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), + 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), + 221: syscalls.Supported("execve", Execve), + 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), + 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), + 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), + 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), + 226: syscalls.Supported("mprotect", Mprotect), + 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), + 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), + 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), + 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), + 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), + 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), + 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), + 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), + 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) + 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), + 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), + 242: syscalls.Supported("accept4", Accept4), + 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), + 260: syscalls.Supported("wait4", Wait4), + 261: syscalls.Supported("prlimit64", Prlimit64), + 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), + 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), + 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), + 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), + 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) + 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), + 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), + 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), + 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), + 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) + 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) + 277: syscalls.Supported("seccomp", Seccomp), + 278: syscalls.Supported("getrandom", GetRandom), + 279: syscalls.Supported("memfd_create", MemfdCreate), + 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), + 281: syscalls.Supported("execveat", Execveat), + 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) + 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), + + // Syscalls after 284 are "backports" from versions of Linux after 4.4. + 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), + 286: syscalls.Supported("preadv2", Preadv2), + 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), + 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), + 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), + 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), + 291: syscalls.Supported("statx", Statx), + 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), + 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), + + // Linux skips ahead to syscall 424 to sync numbers between arches. + 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), + 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), + 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), + 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), + 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), + 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), + 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), + 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), + 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), + 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), + 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), + 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), + }, + Emulate: map[usermem.Addr]uintptr{}, + Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { + t.Kernel().EmitUnimplementedEvent(t) + return 0, syserror.ENOSYS + }, +} + +func init() { + kernel.RegisterSyscallTable(AMD64) + kernel.RegisterSyscallTable(ARM64) +} diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go deleted file mode 100644 index 79066ad2a..000000000 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall -// numbers from Linux 4.4. -var AMD64 = &kernel.SyscallTable{ - OS: abi.Linux, - Arch: arch.AMD64, - Version: kernel.Version{ - // Version 4.4 is chosen as a stable, longterm version of Linux, which - // guides the interface provided by this syscall table. The build - // version is that for a clean build with default kernel config, at 5 - // minutes after v4.4 was tagged. - Sysname: LinuxSysname, - Release: LinuxRelease, - Version: LinuxVersion, - }, - AuditNumber: linux.AUDIT_ARCH_X86_64, - Table: map[uintptr]kernel.Syscall{ - 0: syscalls.Supported("read", Read), - 1: syscalls.Supported("write", Write), - 2: syscalls.PartiallySupported("open", Open, "Options O_DIRECT, O_NOATIME, O_PATH, O_TMPFILE, O_SYNC are not supported.", nil), - 3: syscalls.Supported("close", Close), - 4: syscalls.Supported("stat", Stat), - 5: syscalls.Supported("fstat", Fstat), - 6: syscalls.Supported("lstat", Lstat), - 7: syscalls.Supported("poll", Poll), - 8: syscalls.Supported("lseek", Lseek), - 9: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), - 10: syscalls.Supported("mprotect", Mprotect), - 11: syscalls.Supported("munmap", Munmap), - 12: syscalls.Supported("brk", Brk), - 13: syscalls.Supported("rt_sigaction", RtSigaction), - 14: syscalls.Supported("rt_sigprocmask", RtSigprocmask), - 15: syscalls.Supported("rt_sigreturn", RtSigreturn), - 16: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), - 17: syscalls.Supported("pread64", Pread64), - 18: syscalls.Supported("pwrite64", Pwrite64), - 19: syscalls.Supported("readv", Readv), - 20: syscalls.Supported("writev", Writev), - 21: syscalls.Supported("access", Access), - 22: syscalls.Supported("pipe", Pipe), - 23: syscalls.Supported("select", Select), - 24: syscalls.Supported("sched_yield", SchedYield), - 25: syscalls.Supported("mremap", Mremap), - 26: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), - 27: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), - 28: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 29: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), - 30: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), - 31: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), - 32: syscalls.Supported("dup", Dup), - 33: syscalls.Supported("dup2", Dup2), - 34: syscalls.Supported("pause", Pause), - 35: syscalls.Supported("nanosleep", Nanosleep), - 36: syscalls.Supported("getitimer", Getitimer), - 37: syscalls.Supported("alarm", Alarm), - 38: syscalls.Supported("setitimer", Setitimer), - 39: syscalls.Supported("getpid", Getpid), - 40: syscalls.Supported("sendfile", Sendfile), - 41: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), - 42: syscalls.Supported("connect", Connect), - 43: syscalls.Supported("accept", Accept), - 44: syscalls.Supported("sendto", SendTo), - 45: syscalls.Supported("recvfrom", RecvFrom), - 46: syscalls.Supported("sendmsg", SendMsg), - 47: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), - 48: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), - 49: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), - 50: syscalls.Supported("listen", Listen), - 51: syscalls.Supported("getsockname", GetSockName), - 52: syscalls.Supported("getpeername", GetPeerName), - 53: syscalls.Supported("socketpair", SocketPair), - 54: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), - 55: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), - 56: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), - 57: syscalls.Supported("fork", Fork), - 58: syscalls.Supported("vfork", Vfork), - 59: syscalls.Supported("execve", Execve), - 60: syscalls.Supported("exit", Exit), - 61: syscalls.Supported("wait4", Wait4), - 62: syscalls.Supported("kill", Kill), - 63: syscalls.Supported("uname", Uname), - 64: syscalls.Supported("semget", Semget), - 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 67: syscalls.Supported("shmdt", Shmdt), - 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 70: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 71: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 72: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 73: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), - 74: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), - 75: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), - 76: syscalls.Supported("truncate", Truncate), - 77: syscalls.Supported("ftruncate", Ftruncate), - 78: syscalls.Supported("getdents", Getdents), - 79: syscalls.Supported("getcwd", Getcwd), - 80: syscalls.Supported("chdir", Chdir), - 81: syscalls.Supported("fchdir", Fchdir), - 82: syscalls.Supported("rename", Rename), - 83: syscalls.Supported("mkdir", Mkdir), - 84: syscalls.Supported("rmdir", Rmdir), - 85: syscalls.Supported("creat", Creat), - 86: syscalls.Supported("link", Link), - 87: syscalls.Supported("unlink", Unlink), - 88: syscalls.Supported("symlink", Symlink), - 89: syscalls.Supported("readlink", Readlink), - 90: syscalls.Supported("chmod", Chmod), - 91: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), - 92: syscalls.Supported("chown", Chown), - 93: syscalls.Supported("fchown", Fchown), - 94: syscalls.Supported("lchown", Lchown), - 95: syscalls.Supported("umask", Umask), - 96: syscalls.Supported("gettimeofday", Gettimeofday), - 97: syscalls.Supported("getrlimit", Getrlimit), - 98: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), - 99: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 100: syscalls.Supported("times", Times), - 101: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), - 102: syscalls.Supported("getuid", Getuid), - 103: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), - 104: syscalls.Supported("getgid", Getgid), - 105: syscalls.Supported("setuid", Setuid), - 106: syscalls.Supported("setgid", Setgid), - 107: syscalls.Supported("geteuid", Geteuid), - 108: syscalls.Supported("getegid", Getegid), - 109: syscalls.Supported("setpgid", Setpgid), - 110: syscalls.Supported("getppid", Getppid), - 111: syscalls.Supported("getpgrp", Getpgrp), - 112: syscalls.Supported("setsid", Setsid), - 113: syscalls.Supported("setreuid", Setreuid), - 114: syscalls.Supported("setregid", Setregid), - 115: syscalls.Supported("getgroups", Getgroups), - 116: syscalls.Supported("setgroups", Setgroups), - 117: syscalls.Supported("setresuid", Setresuid), - 118: syscalls.Supported("getresuid", Getresuid), - 119: syscalls.Supported("setresgid", Setresgid), - 120: syscalls.Supported("getresgid", Getresgid), - 121: syscalls.Supported("getpgid", Getpgid), - 122: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 123: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 124: syscalls.Supported("getsid", Getsid), - 125: syscalls.Supported("capget", Capget), - 126: syscalls.Supported("capset", Capset), - 127: syscalls.Supported("rt_sigpending", RtSigpending), - 128: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), - 129: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), - 130: syscalls.Supported("rt_sigsuspend", RtSigsuspend), - 131: syscalls.Supported("sigaltstack", Sigaltstack), - 132: syscalls.Supported("utime", Utime), - 133: syscalls.PartiallySupported("mknod", Mknod, "Device creation is not generally supported. Only regular file and FIFO creation are supported.", nil), - 134: syscalls.Error("uselib", syserror.ENOSYS, "Obsolete", nil), - 135: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 136: syscalls.ErrorWithEvent("ustat", syserror.ENOSYS, "Needs filesystem support.", nil), - 137: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), - 138: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 139: syscalls.ErrorWithEvent("sysfs", syserror.ENOSYS, "", []string{"gvisor.dev/issue/165"}), - 140: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), - 141: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), - 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 143: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), - 144: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), - 145: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), - 146: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), - 147: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), - 149: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 150: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 151: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 152: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 154: syscalls.Error("modify_ldt", syserror.EPERM, "", nil), - 155: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 156: syscalls.Error("sysctl", syserror.EPERM, "Deprecated. Use /proc/sys instead.", nil), - 157: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), - 158: syscalls.PartiallySupported("arch_prctl", ArchPrctl, "Options ARCH_GET_GS, ARCH_SET_GS not supported.", nil), - 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 160: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), - 161: syscalls.Supported("chroot", Chroot), - 162: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), - 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), - 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 165: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 166: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), - 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), - 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), - 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 170: syscalls.Supported("sethostname", Sethostname), - 171: syscalls.Supported("setdomainname", Setdomainname), - 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil), - 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil), - 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil), - 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), - 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 177: syscalls.Error("get_kernel_syms", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 178: syscalls.Error("query_module", syserror.ENOSYS, "Not supported in Linux > 2.6.", nil), - 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 180: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 181: syscalls.Error("getpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 182: syscalls.Error("putpmsg", syserror.ENOSYS, "Not implemented in Linux.", nil), - 183: syscalls.Error("afs_syscall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 184: syscalls.Error("tuxcall", syserror.ENOSYS, "Not implemented in Linux.", nil), - 185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil), - 186: syscalls.Supported("gettid", Gettid), - 187: syscalls.Supported("readahead", Readahead), - 188: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 189: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 190: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), - 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), - 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), - 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), - 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), - 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), - 200: syscalls.Supported("tkill", Tkill), - 201: syscalls.Supported("time", Time), - 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 203: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), - 204: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 205: syscalls.Error("set_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 206: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 207: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 208: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 209: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 210: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 211: syscalls.Error("get_thread_area", syserror.ENOSYS, "Expected to return ENOSYS on 64-bit", nil), - 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 213: syscalls.Supported("epoll_create", EpollCreate), - 214: syscalls.ErrorWithEvent("epoll_ctl_old", syserror.ENOSYS, "Deprecated.", nil), - 215: syscalls.ErrorWithEvent("epoll_wait_old", syserror.ENOSYS, "Deprecated.", nil), - 216: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), - 217: syscalls.Supported("getdents64", Getdents64), - 218: syscalls.Supported("set_tid_address", SetTidAddress), - 219: syscalls.Supported("restart_syscall", RestartSyscall), - 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), - 221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), - 222: syscalls.Supported("timer_create", TimerCreate), - 223: syscalls.Supported("timer_settime", TimerSettime), - 224: syscalls.Supported("timer_gettime", TimerGettime), - 225: syscalls.Supported("timer_getoverrun", TimerGetoverrun), - 226: syscalls.Supported("timer_delete", TimerDelete), - 227: syscalls.Supported("clock_settime", ClockSettime), - 228: syscalls.Supported("clock_gettime", ClockGettime), - 229: syscalls.Supported("clock_getres", ClockGetres), - 230: syscalls.Supported("clock_nanosleep", ClockNanosleep), - 231: syscalls.Supported("exit_group", ExitGroup), - 232: syscalls.Supported("epoll_wait", EpollWait), - 233: syscalls.Supported("epoll_ctl", EpollCtl), - 234: syscalls.Supported("tgkill", Tgkill), - 235: syscalls.Supported("utimes", Utimes), - 236: syscalls.Error("vserver", syserror.ENOSYS, "Not implemented by Linux", nil), - 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 238: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), - 239: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 240: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 241: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 242: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 243: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 244: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 245: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 247: syscalls.Supported("waitid", Waitid), - 248: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 249: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), - 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), - 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), - 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 257: syscalls.Supported("openat", Openat), - 258: syscalls.Supported("mkdirat", Mkdirat), - 259: syscalls.Supported("mknodat", Mknodat), - 260: syscalls.Supported("fchownat", Fchownat), - 261: syscalls.Supported("futimesat", Futimesat), - 262: syscalls.Supported("fstatat", Fstatat), - 263: syscalls.Supported("unlinkat", Unlinkat), - 264: syscalls.Supported("renameat", Renameat), - 265: syscalls.Supported("linkat", Linkat), - 266: syscalls.Supported("symlinkat", Symlinkat), - 267: syscalls.Supported("readlinkat", Readlinkat), - 268: syscalls.Supported("fchmodat", Fchmodat), - 269: syscalls.Supported("faccessat", Faccessat), - 270: syscalls.Supported("pselect", Pselect), - 271: syscalls.Supported("ppoll", Ppoll), - 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 275: syscalls.Supported("splice", Splice), - 276: syscalls.Supported("tee", Tee), - 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 278: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 280: syscalls.Supported("utimensat", Utimensat), - 281: syscalls.Supported("epoll_pwait", EpollPwait), - 282: syscalls.PartiallySupported("signalfd", Signalfd, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 283: syscalls.Supported("timerfd_create", TimerfdCreate), - 284: syscalls.Supported("eventfd", Eventfd), - 285: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), - 286: syscalls.Supported("timerfd_settime", TimerfdSettime), - 287: syscalls.Supported("timerfd_gettime", TimerfdGettime), - 288: syscalls.Supported("accept4", Accept4), - 289: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 290: syscalls.Supported("eventfd2", Eventfd2), - 291: syscalls.Supported("epoll_create1", EpollCreate1), - 292: syscalls.Supported("dup3", Dup3), - 293: syscalls.Supported("pipe2", Pipe2), - 294: syscalls.Supported("inotify_init1", InotifyInit1), - 295: syscalls.Supported("preadv", Preadv), - 296: syscalls.Supported("pwritev", Pwritev), - 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 298: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), - 299: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 300: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 301: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 302: syscalls.Supported("prlimit64", Prlimit64), - 303: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 304: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 306: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 307: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 308: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 309: syscalls.Supported("getcpu", Getcpu), - 310: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 311: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), - 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 314: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 315: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 316: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 317: syscalls.Supported("seccomp", Seccomp), - 318: syscalls.Supported("getrandom", GetRandom), - 319: syscalls.Supported("memfd_create", MemfdCreate), - 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil), - 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 322: syscalls.Supported("execveat", Execveat), - 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) - 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - - // Syscalls implemented after 325 are "backports" from versions - // of Linux after 4.4. - 326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), - 327: syscalls.Supported("preadv2", Preadv2), - 328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), - 332: syscalls.Supported("statx", Statx), - 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), - - // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), - }, - - Emulate: map[usermem.Addr]uintptr{ - 0xffffffffff600000: 96, // vsyscall gettimeofday(2) - 0xffffffffff600400: 201, // vsyscall time(2) - 0xffffffffff600800: 309, // vsyscall getcpu(2) - }, - Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS - }, -} diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go deleted file mode 100644 index 7421619de..000000000 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package linux - -import ( - "gvisor.dev/gvisor/pkg/abi" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// ARM64 is a table of Linux arm64 syscall API with the corresponding syscall -// numbers from Linux 4.4. -var ARM64 = &kernel.SyscallTable{ - OS: abi.Linux, - Arch: arch.ARM64, - Version: kernel.Version{ - Sysname: LinuxSysname, - Release: LinuxRelease, - Version: LinuxVersion, - }, - AuditNumber: linux.AUDIT_ARCH_AARCH64, - Table: map[uintptr]kernel.Syscall{ - 0: syscalls.PartiallySupported("io_setup", IoSetup, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 1: syscalls.PartiallySupported("io_destroy", IoDestroy, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}), - 5: syscalls.PartiallySupported("setxattr", SetXattr, "Only supported for tmpfs.", nil), - 6: syscalls.PartiallySupported("lsetxattr", LSetXattr, "Only supported for tmpfs.", nil), - 7: syscalls.PartiallySupported("fsetxattr", FSetXattr, "Only supported for tmpfs.", nil), - 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), - 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), - 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), - 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), - 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), - 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), - 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), - 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), - 17: syscalls.Supported("getcwd", Getcwd), - 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), - 19: syscalls.Supported("eventfd2", Eventfd2), - 20: syscalls.Supported("epoll_create1", EpollCreate1), - 21: syscalls.Supported("epoll_ctl", EpollCtl), - 22: syscalls.Supported("epoll_pwait", EpollPwait), - 23: syscalls.Supported("dup", Dup), - 24: syscalls.Supported("dup3", Dup3), - 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 26: syscalls.Supported("inotify_init1", InotifyInit1), - 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), - 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), - 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 32: syscalls.PartiallySupported("flock", Flock, "Locks are held within the sandbox only.", nil), - 33: syscalls.Supported("mknodat", Mknodat), - 34: syscalls.Supported("mkdirat", Mkdirat), - 35: syscalls.Supported("unlinkat", Unlinkat), - 36: syscalls.Supported("symlinkat", Symlinkat), - 37: syscalls.Supported("linkat", Linkat), - 38: syscalls.Supported("renameat", Renameat), - 39: syscalls.PartiallySupported("umount2", Umount2, "Not all options or file systems are supported.", nil), - 40: syscalls.PartiallySupported("mount", Mount, "Not all options or file systems are supported.", nil), - 41: syscalls.Error("pivot_root", syserror.EPERM, "", nil), - 42: syscalls.Error("nfsservctl", syserror.ENOSYS, "Removed after Linux 3.1.", nil), - 43: syscalls.PartiallySupported("statfs", Statfs, "Depends on the backing file system implementation.", nil), - 44: syscalls.PartiallySupported("fstatfs", Fstatfs, "Depends on the backing file system implementation.", nil), - 45: syscalls.Supported("truncate", Truncate), - 46: syscalls.Supported("ftruncate", Ftruncate), - 47: syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil), - 48: syscalls.Supported("faccessat", Faccessat), - 49: syscalls.Supported("chdir", Chdir), - 50: syscalls.Supported("fchdir", Fchdir), - 51: syscalls.Supported("chroot", Chroot), - 52: syscalls.PartiallySupported("fchmod", Fchmod, "Options S_ISUID and S_ISGID not supported.", nil), - 53: syscalls.Supported("fchmodat", Fchmodat), - 54: syscalls.Supported("fchownat", Fchownat), - 55: syscalls.Supported("fchown", Fchown), - 56: syscalls.Supported("openat", Openat), - 57: syscalls.Supported("close", Close), - 58: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil), - 59: syscalls.Supported("pipe2", Pipe2), - 60: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations - 61: syscalls.Supported("getdents64", Getdents64), - 62: syscalls.Supported("lseek", Lseek), - 63: syscalls.Supported("read", Read), - 64: syscalls.Supported("write", Write), - 65: syscalls.Supported("readv", Readv), - 66: syscalls.Supported("writev", Writev), - 67: syscalls.Supported("pread64", Pread64), - 68: syscalls.Supported("pwrite64", Pwrite64), - 69: syscalls.Supported("preadv", Preadv), - 70: syscalls.Supported("pwritev", Pwritev), - 71: syscalls.Supported("sendfile", Sendfile), - 72: syscalls.Supported("pselect", Pselect), - 73: syscalls.Supported("ppoll", Ppoll), - 74: syscalls.PartiallySupported("signalfd4", Signalfd4, "Semantics are slightly different.", []string{"gvisor.dev/issue/139"}), - 75: syscalls.ErrorWithEvent("vmsplice", syserror.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 76: syscalls.PartiallySupported("splice", Splice, "Stub implementation.", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098) - 77: syscalls.Supported("tee", Tee), - 78: syscalls.Supported("readlinkat", Readlinkat), - 79: syscalls.Supported("fstatat", Fstatat), - 80: syscalls.Supported("fstat", Fstat), - 81: syscalls.PartiallySupported("sync", Sync, "Full data flush is not guaranteed at this time.", nil), - 82: syscalls.PartiallySupported("fsync", Fsync, "Full data flush is not guaranteed at this time.", nil), - 83: syscalls.PartiallySupported("fdatasync", Fdatasync, "Full data flush is not guaranteed at this time.", nil), - 84: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), - 85: syscalls.Supported("timerfd_create", TimerfdCreate), - 86: syscalls.Supported("timerfd_settime", TimerfdSettime), - 87: syscalls.Supported("timerfd_gettime", TimerfdGettime), - 88: syscalls.Supported("utimensat", Utimensat), - 89: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil), - 90: syscalls.Supported("capget", Capget), - 91: syscalls.Supported("capset", Capset), - 92: syscalls.ErrorWithEvent("personality", syserror.EINVAL, "Unable to change personality.", nil), - 93: syscalls.Supported("exit", Exit), - 94: syscalls.Supported("exit_group", ExitGroup), - 95: syscalls.Supported("waitid", Waitid), - 96: syscalls.Supported("set_tid_address", SetTidAddress), - 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 101: syscalls.Supported("nanosleep", Nanosleep), - 102: syscalls.Supported("getitimer", Getitimer), - 103: syscalls.Supported("setitimer", Setitimer), - 104: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil), - 105: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil), - 106: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil), - 107: syscalls.Supported("timer_create", TimerCreate), - 108: syscalls.Supported("timer_gettime", TimerGettime), - 109: syscalls.Supported("timer_getoverrun", TimerGetoverrun), - 110: syscalls.Supported("timer_settime", TimerSettime), - 111: syscalls.Supported("timer_delete", TimerDelete), - 112: syscalls.Supported("clock_settime", ClockSettime), - 113: syscalls.Supported("clock_gettime", ClockGettime), - 114: syscalls.Supported("clock_getres", ClockGetres), - 115: syscalls.Supported("clock_nanosleep", ClockNanosleep), - 116: syscalls.PartiallySupported("syslog", Syslog, "Outputs a dummy message for security reasons.", nil), - 117: syscalls.PartiallySupported("ptrace", Ptrace, "Options PTRACE_PEEKSIGINFO, PTRACE_SECCOMP_GET_FILTER not supported.", nil), - 118: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil), - 119: syscalls.PartiallySupported("sched_setscheduler", SchedSetscheduler, "Stub implementation.", nil), - 120: syscalls.PartiallySupported("sched_getscheduler", SchedGetscheduler, "Stub implementation.", nil), - 121: syscalls.PartiallySupported("sched_getparam", SchedGetparam, "Stub implementation.", nil), - 122: syscalls.PartiallySupported("sched_setaffinity", SchedSetaffinity, "Stub implementation.", nil), - 123: syscalls.PartiallySupported("sched_getaffinity", SchedGetaffinity, "Stub implementation.", nil), - 124: syscalls.Supported("sched_yield", SchedYield), - 125: syscalls.PartiallySupported("sched_get_priority_max", SchedGetPriorityMax, "Stub implementation.", nil), - 126: syscalls.PartiallySupported("sched_get_priority_min", SchedGetPriorityMin, "Stub implementation.", nil), - 127: syscalls.ErrorWithEvent("sched_rr_get_interval", syserror.EPERM, "", nil), - 128: syscalls.Supported("restart_syscall", RestartSyscall), - 129: syscalls.Supported("kill", Kill), - 130: syscalls.Supported("tkill", Tkill), - 131: syscalls.Supported("tgkill", Tgkill), - 132: syscalls.Supported("sigaltstack", Sigaltstack), - 133: syscalls.Supported("rt_sigsuspend", RtSigsuspend), - 134: syscalls.Supported("rt_sigaction", RtSigaction), - 135: syscalls.Supported("rt_sigprocmask", RtSigprocmask), - 136: syscalls.Supported("rt_sigpending", RtSigpending), - 137: syscalls.Supported("rt_sigtimedwait", RtSigtimedwait), - 138: syscalls.Supported("rt_sigqueueinfo", RtSigqueueinfo), - 139: syscalls.Supported("rt_sigreturn", RtSigreturn), - 140: syscalls.PartiallySupported("setpriority", Setpriority, "Stub implementation.", nil), - 141: syscalls.PartiallySupported("getpriority", Getpriority, "Stub implementation.", nil), - 142: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil), - 143: syscalls.Supported("setregid", Setregid), - 144: syscalls.Supported("setgid", Setgid), - 145: syscalls.Supported("setreuid", Setreuid), - 146: syscalls.Supported("setuid", Setuid), - 147: syscalls.Supported("setresuid", Setresuid), - 148: syscalls.Supported("getresuid", Getresuid), - 149: syscalls.Supported("setresgid", Setresgid), - 150: syscalls.Supported("getresgid", Getresgid), - 151: syscalls.ErrorWithEvent("setfsuid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 152: syscalls.ErrorWithEvent("setfsgid", syserror.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702) - 153: syscalls.Supported("times", Times), - 154: syscalls.Supported("setpgid", Setpgid), - 155: syscalls.Supported("getpgid", Getpgid), - 156: syscalls.Supported("getsid", Getsid), - 157: syscalls.Supported("setsid", Setsid), - 158: syscalls.Supported("getgroups", Getgroups), - 159: syscalls.Supported("setgroups", Setgroups), - 160: syscalls.Supported("uname", Uname), - 161: syscalls.Supported("sethostname", Sethostname), - 162: syscalls.Supported("setdomainname", Setdomainname), - 163: syscalls.Supported("getrlimit", Getrlimit), - 164: syscalls.PartiallySupported("setrlimit", Setrlimit, "Not all rlimits are enforced.", nil), - 165: syscalls.PartiallySupported("getrusage", Getrusage, "Fields ru_maxrss, ru_minflt, ru_majflt, ru_inblock, ru_oublock are not supported. Fields ru_utime and ru_stime have low precision.", nil), - 166: syscalls.Supported("umask", Umask), - 167: syscalls.PartiallySupported("prctl", Prctl, "Not all options are supported.", nil), - 168: syscalls.Supported("getcpu", Getcpu), - 169: syscalls.Supported("gettimeofday", Gettimeofday), - 170: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil), - 171: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil), - 172: syscalls.Supported("getpid", Getpid), - 173: syscalls.Supported("getppid", Getppid), - 174: syscalls.Supported("getuid", Getuid), - 175: syscalls.Supported("geteuid", Geteuid), - 176: syscalls.Supported("getgid", Getgid), - 177: syscalls.Supported("getegid", Getegid), - 178: syscalls.Supported("gettid", Gettid), - 179: syscalls.PartiallySupported("sysinfo", Sysinfo, "Fields loads, sharedram, bufferram, totalswap, freeswap, totalhigh, freehigh not supported.", nil), - 180: syscalls.ErrorWithEvent("mq_open", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 181: syscalls.ErrorWithEvent("mq_unlink", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 182: syscalls.ErrorWithEvent("mq_timedsend", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 183: syscalls.ErrorWithEvent("mq_timedreceive", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 184: syscalls.ErrorWithEvent("mq_notify", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 185: syscalls.ErrorWithEvent("mq_getsetattr", syserror.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921) - 186: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 187: syscalls.ErrorWithEvent("msgctl", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) - 190: syscalls.Supported("semget", Semget), - 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), - 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), - 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), - 195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil), - 196: syscalls.PartiallySupported("shmat", Shmat, "Option SHM_RND is not supported.", nil), - 197: syscalls.Supported("shmdt", Shmdt), - 198: syscalls.PartiallySupported("socket", Socket, "Limited support for AF_NETLINK, NETLINK_ROUTE sockets. Limited support for SOCK_RAW.", nil), - 199: syscalls.Supported("socketpair", SocketPair), - 200: syscalls.PartiallySupported("bind", Bind, "Autobind for abstract Unix sockets is not supported.", nil), - 201: syscalls.Supported("listen", Listen), - 202: syscalls.Supported("accept", Accept), - 203: syscalls.Supported("connect", Connect), - 204: syscalls.Supported("getsockname", GetSockName), - 205: syscalls.Supported("getpeername", GetPeerName), - 206: syscalls.Supported("sendto", SendTo), - 207: syscalls.Supported("recvfrom", RecvFrom), - 208: syscalls.PartiallySupported("setsockopt", SetSockOpt, "Not all socket options are supported.", nil), - 209: syscalls.PartiallySupported("getsockopt", GetSockOpt, "Not all socket options are supported.", nil), - 210: syscalls.PartiallySupported("shutdown", Shutdown, "Not all flags and control messages are supported.", nil), - 211: syscalls.Supported("sendmsg", SendMsg), - 212: syscalls.PartiallySupported("recvmsg", RecvMsg, "Not all flags and control messages are supported.", nil), - 213: syscalls.Supported("readahead", Readahead), - 214: syscalls.Supported("brk", Brk), - 215: syscalls.Supported("munmap", Munmap), - 216: syscalls.Supported("mremap", Mremap), - 217: syscalls.Error("add_key", syserror.EACCES, "Not available to user.", nil), - 218: syscalls.Error("request_key", syserror.EACCES, "Not available to user.", nil), - 219: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), - 220: syscalls.PartiallySupported("clone", Clone, "Mount namespace (CLONE_NEWNS) not supported. Options CLONE_PARENT, CLONE_SYSVSEM not supported.", nil), - 221: syscalls.Supported("execve", Execve), - 222: syscalls.PartiallySupported("mmap", Mmap, "Generally supported with exceptions. Options MAP_FIXED_NOREPLACE, MAP_SHARED_VALIDATE, MAP_SYNC MAP_GROWSDOWN, MAP_HUGETLB are not supported.", nil), - 223: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil), - 224: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil), - 225: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil), - 226: syscalls.Supported("mprotect", Mprotect), - 227: syscalls.PartiallySupported("msync", Msync, "Full data flush is not guaranteed at this time.", nil), - 228: syscalls.PartiallySupported("mlock", Mlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 229: syscalls.PartiallySupported("munlock", Munlock, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 230: syscalls.PartiallySupported("mlockall", Mlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 231: syscalls.PartiallySupported("munlockall", Munlockall, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - 232: syscalls.PartiallySupported("mincore", Mincore, "Stub implementation. The sandbox does not have access to this information. Reports all mapped pages are resident.", nil), - 233: syscalls.PartiallySupported("madvise", Madvise, "Options MADV_DONTNEED, MADV_DONTFORK are supported. Other advice is ignored.", nil), - 234: syscalls.ErrorWithEvent("remap_file_pages", syserror.ENOSYS, "Deprecated since Linux 3.16.", nil), - 235: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}), - 236: syscalls.PartiallySupported("get_mempolicy", GetMempolicy, "Stub implementation.", nil), - 237: syscalls.PartiallySupported("set_mempolicy", SetMempolicy, "Stub implementation.", nil), - 238: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), - 239: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly) - 240: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), - 241: syscalls.ErrorWithEvent("perf_event_open", syserror.ENODEV, "No support for perf counters", nil), - 242: syscalls.Supported("accept4", Accept4), - 243: syscalls.PartiallySupported("recvmmsg", RecvMMsg, "Not all flags and control messages are supported.", nil), - 260: syscalls.Supported("wait4", Wait4), - 261: syscalls.Supported("prlimit64", Prlimit64), - 262: syscalls.ErrorWithEvent("fanotify_init", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 263: syscalls.ErrorWithEvent("fanotify_mark", syserror.ENOSYS, "Needs CONFIG_FANOTIFY", nil), - 264: syscalls.Error("name_to_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 265: syscalls.Error("open_by_handle_at", syserror.EOPNOTSUPP, "Not supported by gVisor filesystems", nil), - 266: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil), - 267: syscalls.PartiallySupported("syncfs", Syncfs, "Depends on backing file system.", nil), - 268: syscalls.ErrorWithEvent("setns", syserror.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995) - 269: syscalls.PartiallySupported("sendmmsg", SendMMsg, "Not all flags and control messages are supported.", nil), - 270: syscalls.ErrorWithEvent("process_vm_readv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 271: syscalls.ErrorWithEvent("process_vm_writev", syserror.ENOSYS, "", []string{"gvisor.dev/issue/158"}), - 272: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil), - 273: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil), - 274: syscalls.ErrorWithEvent("sched_setattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 275: syscalls.ErrorWithEvent("sched_getattr", syserror.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272) - 276: syscalls.ErrorWithEvent("renameat2", syserror.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772) - 277: syscalls.Supported("seccomp", Seccomp), - 278: syscalls.Supported("getrandom", GetRandom), - 279: syscalls.Supported("memfd_create", MemfdCreate), - 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), - 281: syscalls.Supported("execveat", Execveat), - 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) - 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), - - // Syscalls after 284 are "backports" from versions of Linux after 4.4. - 285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil), - 286: syscalls.Supported("preadv2", Preadv2), - 287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil), - 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil), - 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil), - 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil), - 291: syscalls.Supported("statx", Statx), - 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil), - 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil), - - // Linux skips ahead to syscall 424 to sync numbers between arches. - 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil), - 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil), - 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil), - 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil), - 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil), - 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil), - 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil), - 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil), - }, - Emulate: map[usermem.Addr]uintptr{}, - - Missing: func(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { - t.Kernel().EmitUnimplementedEvent(t) - return 0, syserror.ENOSYS - }, -} diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 8a34c4e99..ed3413ca6 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -15,6 +15,7 @@ package linux import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -22,32 +23,24 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -const ( - // EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please - // see its man page for more information. - EFD_SEMAPHORE = 1 - EFD_NONBLOCK = 0x800 - EFD_CLOEXEC = 0x80000 -) - // Eventfd2 implements linux syscall eventfd2(2). func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { initVal := args[0].Int() flags := uint(args[1].Uint()) - allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { return 0, nil, syserror.EINVAL } - event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) + event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0) event.SetFlags(fs.SettableFileFlags{ - NonBlocking: flags&EFD_NONBLOCK != 0, + NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) defer event.DecRef() fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ - CloseOnExec: flags&EFD_CLOEXEC != 0, + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index a65b560c8..297de052a 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -29,13 +29,18 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca mf.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ Procs: uint16(len(t.PIDNamespace().Tasks())), Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, - FreeRAM: totalSize - totalUsage, + FreeRAM: memFree, Unit: 1, } _, err := t.CopyOut(addr, si) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index f6fb0f219..4c7b8f819 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -6,15 +6,13 @@ go_library( name = "vfs2", srcs = [ "epoll.go", + "eventfd.go", "execve.go", "fd.go", "filesystem.go", "fscontext.go", "getdents.go", "ioctl.go", - "linux64.go", - "linux64_override_amd64.go", - "linux64_override_arm64.go", "mmap.go", "path.go", "pipe.go", @@ -26,7 +24,8 @@ go_library( "stat_amd64.go", "stat_arm64.go", "sync.go", - "sys_timerfd.go", + "timerfd.go", + "vfs2.go", "xattr.go", ], marshal = True, diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go new file mode 100644 index 000000000..bd2194972 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -0,0 +1,59 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Eventfd2 implements linux syscall eventfd2(2). +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := uint64(args[0].Uint()) + flags := uint(args[1].Uint()) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) + + if flags & ^allOps != 0 { + return 0, nil, syserror.EINVAL + } + + fileFlags := uint32(linux.O_RDWR) + if flags&linux.EFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + semMode := flags&linux.EFD_SEMAPHORE != 0 + eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + if err != nil { + return 0, nil, err + } + defer eventfd.DecRef() + + fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). +func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64.go b/pkg/sentry/syscalls/linux/vfs2/linux64.go deleted file mode 100644 index 19ee36081..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package vfs2 provides syscall implementations that use VFS2. -package vfs2 diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go deleted file mode 100644 index 74920f785..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" -) - -// Override syscall table to add syscalls implementations from this package. -func Override(table map[uintptr]kernel.Syscall) { - table[0] = syscalls.Supported("read", Read) - table[1] = syscalls.Supported("write", Write) - table[2] = syscalls.Supported("open", Open) - table[3] = syscalls.Supported("close", Close) - table[4] = syscalls.Supported("stat", Stat) - table[5] = syscalls.Supported("fstat", Fstat) - table[6] = syscalls.Supported("lstat", Lstat) - table[7] = syscalls.Supported("poll", Poll) - table[8] = syscalls.Supported("lseek", Lseek) - table[9] = syscalls.Supported("mmap", Mmap) - table[16] = syscalls.Supported("ioctl", Ioctl) - table[17] = syscalls.Supported("pread64", Pread64) - table[18] = syscalls.Supported("pwrite64", Pwrite64) - table[19] = syscalls.Supported("readv", Readv) - table[20] = syscalls.Supported("writev", Writev) - table[21] = syscalls.Supported("access", Access) - table[22] = syscalls.Supported("pipe", Pipe) - table[23] = syscalls.Supported("select", Select) - table[32] = syscalls.Supported("dup", Dup) - table[33] = syscalls.Supported("dup2", Dup2) - delete(table, 40) // sendfile - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[41] = syscalls.PartiallySupported("socket", Socket, "In process of porting socket syscalls to VFS2.", nil) - table[42] = syscalls.PartiallySupported("connect", Connect, "In process of porting socket syscalls to VFS2.", nil) - table[43] = syscalls.PartiallySupported("accept", Accept, "In process of porting socket syscalls to VFS2.", nil) - table[44] = syscalls.PartiallySupported("sendto", SendTo, "In process of porting socket syscalls to VFS2.", nil) - table[45] = syscalls.PartiallySupported("recvfrom", RecvFrom, "In process of porting socket syscalls to VFS2.", nil) - table[46] = syscalls.PartiallySupported("sendmsg", SendMsg, "In process of porting socket syscalls to VFS2.", nil) - table[47] = syscalls.PartiallySupported("recvmsg", RecvMsg, "In process of porting socket syscalls to VFS2.", nil) - table[48] = syscalls.PartiallySupported("shutdown", Shutdown, "In process of porting socket syscalls to VFS2.", nil) - table[49] = syscalls.PartiallySupported("bind", Bind, "In process of porting socket syscalls to VFS2.", nil) - table[50] = syscalls.PartiallySupported("listen", Listen, "In process of porting socket syscalls to VFS2.", nil) - table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil) - table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil) - table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil) - table[54] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil) - table[55] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil) - table[59] = syscalls.Supported("execve", Execve) - table[72] = syscalls.Supported("fcntl", Fcntl) - delete(table, 73) // flock - table[74] = syscalls.Supported("fsync", Fsync) - table[75] = syscalls.Supported("fdatasync", Fdatasync) - table[76] = syscalls.Supported("truncate", Truncate) - table[77] = syscalls.Supported("ftruncate", Ftruncate) - table[78] = syscalls.Supported("getdents", Getdents) - table[79] = syscalls.Supported("getcwd", Getcwd) - table[80] = syscalls.Supported("chdir", Chdir) - table[81] = syscalls.Supported("fchdir", Fchdir) - table[82] = syscalls.Supported("rename", Rename) - table[83] = syscalls.Supported("mkdir", Mkdir) - table[84] = syscalls.Supported("rmdir", Rmdir) - table[85] = syscalls.Supported("creat", Creat) - table[86] = syscalls.Supported("link", Link) - table[87] = syscalls.Supported("unlink", Unlink) - table[88] = syscalls.Supported("symlink", Symlink) - table[89] = syscalls.Supported("readlink", Readlink) - table[90] = syscalls.Supported("chmod", Chmod) - table[91] = syscalls.Supported("fchmod", Fchmod) - table[92] = syscalls.Supported("chown", Chown) - table[93] = syscalls.Supported("fchown", Fchown) - table[94] = syscalls.Supported("lchown", Lchown) - table[132] = syscalls.Supported("utime", Utime) - table[133] = syscalls.Supported("mknod", Mknod) - table[137] = syscalls.Supported("statfs", Statfs) - table[138] = syscalls.Supported("fstatfs", Fstatfs) - table[161] = syscalls.Supported("chroot", Chroot) - table[162] = syscalls.Supported("sync", Sync) - delete(table, 165) // mount - delete(table, 166) // umount2 - delete(table, 187) // readahead - table[188] = syscalls.Supported("setxattr", Setxattr) - table[189] = syscalls.Supported("lsetxattr", Lsetxattr) - table[190] = syscalls.Supported("fsetxattr", Fsetxattr) - table[191] = syscalls.Supported("getxattr", Getxattr) - table[192] = syscalls.Supported("lgetxattr", Lgetxattr) - table[193] = syscalls.Supported("fgetxattr", Fgetxattr) - table[194] = syscalls.Supported("listxattr", Listxattr) - table[195] = syscalls.Supported("llistxattr", Llistxattr) - table[196] = syscalls.Supported("flistxattr", Flistxattr) - table[197] = syscalls.Supported("removexattr", Removexattr) - table[198] = syscalls.Supported("lremovexattr", Lremovexattr) - table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(table, 206) // io_setup - delete(table, 207) // io_destroy - delete(table, 208) // io_getevents - delete(table, 209) // io_submit - delete(table, 210) // io_cancel - table[213] = syscalls.Supported("epoll_create", EpollCreate) - table[217] = syscalls.Supported("getdents64", Getdents64) - delete(table, 221) // fdavise64 - table[232] = syscalls.Supported("epoll_wait", EpollWait) - table[233] = syscalls.Supported("epoll_ctl", EpollCtl) - table[235] = syscalls.Supported("utimes", Utimes) - delete(table, 253) // inotify_init - delete(table, 254) // inotify_add_watch - delete(table, 255) // inotify_rm_watch - table[257] = syscalls.Supported("openat", Openat) - table[258] = syscalls.Supported("mkdirat", Mkdirat) - table[259] = syscalls.Supported("mknodat", Mknodat) - table[260] = syscalls.Supported("fchownat", Fchownat) - table[261] = syscalls.Supported("futimens", Futimens) - table[262] = syscalls.Supported("newfstatat", Newfstatat) - table[263] = syscalls.Supported("unlinkat", Unlinkat) - table[264] = syscalls.Supported("renameat", Renameat) - table[265] = syscalls.Supported("linkat", Linkat) - table[266] = syscalls.Supported("symlinkat", Symlinkat) - table[267] = syscalls.Supported("readlinkat", Readlinkat) - table[268] = syscalls.Supported("fchmodat", Fchmodat) - table[269] = syscalls.Supported("faccessat", Faccessat) - table[270] = syscalls.Supported("pselect", Pselect) - table[271] = syscalls.Supported("ppoll", Ppoll) - delete(table, 275) // splice - delete(table, 276) // tee - table[277] = syscalls.Supported("sync_file_range", SyncFileRange) - table[280] = syscalls.Supported("utimensat", Utimensat) - table[281] = syscalls.Supported("epoll_pwait", EpollPwait) - delete(table, 282) // signalfd - table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) - delete(table, 284) // eventfd - delete(table, 285) // fallocate - table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) - table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil) - delete(table, 289) // signalfd4 - delete(table, 290) // eventfd2 - table[291] = syscalls.Supported("epoll_create1", EpollCreate1) - table[292] = syscalls.Supported("dup3", Dup3) - table[293] = syscalls.Supported("pipe2", Pipe2) - delete(table, 294) // inotify_init1 - table[295] = syscalls.Supported("preadv", Preadv) - table[296] = syscalls.Supported("pwritev", Pwritev) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[299] = syscalls.PartiallySupported("recvmmsg", RecvMMsg, "In process of porting socket syscalls to VFS2.", nil) - table[306] = syscalls.Supported("syncfs", Syncfs) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[307] = syscalls.PartiallySupported("sendmmsg", SendMMsg, "In process of porting socket syscalls to VFS2.", nil) - table[316] = syscalls.Supported("renameat2", Renameat2) - delete(table, 319) // memfd_create - table[322] = syscalls.Supported("execveat", Execveat) - table[327] = syscalls.Supported("preadv2", Preadv2) - table[328] = syscalls.Supported("pwritev2", Pwritev2) - table[332] = syscalls.Supported("statx", Statx) -} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go deleted file mode 100644 index a6b367468..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls" -) - -// Override syscall table to add syscalls implementations from this package. -func Override(table map[uintptr]kernel.Syscall) { - table[63] = syscalls.Supported("read", Read) -} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 6c6998f45..3a7ef24f5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -15,9 +15,13 @@ package vfs2 import ( + "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/socket" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -88,7 +92,12 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + if err != syserror.ErrWouldBlock { + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { return n, err } @@ -108,7 +117,12 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt if err != syserror.ErrWouldBlock { break } - if err := t.Block(ch); err != nil { + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } @@ -233,7 +247,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) - if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + if err != syserror.ErrWouldBlock { + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { return n, err } @@ -253,7 +272,12 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of if err != syserror.ErrWouldBlock { break } - if err := t.Block(ch); err != nil { + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } @@ -320,7 +344,12 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) - if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + if err != syserror.ErrWouldBlock { + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { return n, err } @@ -340,7 +369,12 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op if err != syserror.ErrWouldBlock { break } - if err := t.Block(ch); err != nil { + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } @@ -465,7 +499,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) - if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + if err != syserror.ErrWouldBlock { + return n, err + } + + allowBlock, deadline, hasDeadline := blockPolicy(t, file) + if !allowBlock { return n, err } @@ -485,7 +524,12 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o if err != syserror.ErrWouldBlock { break } - if err := t.Block(ch); err != nil { + + // Wait for a notification that we should retry. + if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = syserror.ErrWouldBlock + } break } } @@ -494,6 +538,23 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o return total, err } +func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) { + if file.StatusFlags()&linux.O_NONBLOCK != 0 { + return false, ktime.Time{}, false + } + // Sockets support read/write timeouts. + if s, ok := file.Impl().(socket.SocketVFS2); ok { + dl := s.RecvTimeout() + if dl < 0 { + return false, ktime.Time{}, false + } + if dl > 0 { + return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true + } + } + return true, ktime.Time{}, false +} + // Lseek implements Linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 7938a5249..839a07db1 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -46,7 +46,10 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel default: return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags) + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + file, err := t.Kernel().VFS().NewTimerFD(clock, linux.O_RDWR|fileFlags) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go new file mode 100644 index 000000000..f1b697844 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -0,0 +1,172 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs2 provides syscall implementations that use VFS2. +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/syscalls" + "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +// Override syscall table to add syscalls implementations from this package. +func Override() { + // Override AMD64. + s := linux.AMD64 + s.Table[0] = syscalls.Supported("read", Read) + s.Table[1] = syscalls.Supported("write", Write) + s.Table[2] = syscalls.Supported("open", Open) + s.Table[3] = syscalls.Supported("close", Close) + s.Table[4] = syscalls.Supported("stat", Stat) + s.Table[5] = syscalls.Supported("fstat", Fstat) + s.Table[6] = syscalls.Supported("lstat", Lstat) + s.Table[7] = syscalls.Supported("poll", Poll) + s.Table[8] = syscalls.Supported("lseek", Lseek) + s.Table[9] = syscalls.Supported("mmap", Mmap) + s.Table[16] = syscalls.Supported("ioctl", Ioctl) + s.Table[17] = syscalls.Supported("pread64", Pread64) + s.Table[18] = syscalls.Supported("pwrite64", Pwrite64) + s.Table[19] = syscalls.Supported("readv", Readv) + s.Table[20] = syscalls.Supported("writev", Writev) + s.Table[21] = syscalls.Supported("access", Access) + s.Table[22] = syscalls.Supported("pipe", Pipe) + s.Table[23] = syscalls.Supported("select", Select) + s.Table[32] = syscalls.Supported("dup", Dup) + s.Table[33] = syscalls.Supported("dup2", Dup2) + delete(s.Table, 40) // sendfile + s.Table[41] = syscalls.Supported("socket", Socket) + s.Table[42] = syscalls.Supported("connect", Connect) + s.Table[43] = syscalls.Supported("accept", Accept) + s.Table[44] = syscalls.Supported("sendto", SendTo) + s.Table[45] = syscalls.Supported("recvfrom", RecvFrom) + s.Table[46] = syscalls.Supported("sendmsg", SendMsg) + s.Table[47] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[48] = syscalls.Supported("shutdown", Shutdown) + s.Table[49] = syscalls.Supported("bind", Bind) + s.Table[50] = syscalls.Supported("listen", Listen) + s.Table[51] = syscalls.Supported("getsockname", GetSockName) + s.Table[52] = syscalls.Supported("getpeername", GetPeerName) + s.Table[53] = syscalls.Supported("socketpair", SocketPair) + s.Table[54] = syscalls.Supported("setsockopt", SetSockOpt) + s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) + s.Table[59] = syscalls.Supported("execve", Execve) + s.Table[72] = syscalls.Supported("fcntl", Fcntl) + delete(s.Table, 73) // flock + s.Table[74] = syscalls.Supported("fsync", Fsync) + s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) + s.Table[76] = syscalls.Supported("truncate", Truncate) + s.Table[77] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[78] = syscalls.Supported("getdents", Getdents) + s.Table[79] = syscalls.Supported("getcwd", Getcwd) + s.Table[80] = syscalls.Supported("chdir", Chdir) + s.Table[81] = syscalls.Supported("fchdir", Fchdir) + s.Table[82] = syscalls.Supported("rename", Rename) + s.Table[83] = syscalls.Supported("mkdir", Mkdir) + s.Table[84] = syscalls.Supported("rmdir", Rmdir) + s.Table[85] = syscalls.Supported("creat", Creat) + s.Table[86] = syscalls.Supported("link", Link) + s.Table[87] = syscalls.Supported("unlink", Unlink) + s.Table[88] = syscalls.Supported("symlink", Symlink) + s.Table[89] = syscalls.Supported("readlink", Readlink) + s.Table[90] = syscalls.Supported("chmod", Chmod) + s.Table[91] = syscalls.Supported("fchmod", Fchmod) + s.Table[92] = syscalls.Supported("chown", Chown) + s.Table[93] = syscalls.Supported("fchown", Fchown) + s.Table[94] = syscalls.Supported("lchown", Lchown) + s.Table[132] = syscalls.Supported("utime", Utime) + s.Table[133] = syscalls.Supported("mknod", Mknod) + s.Table[137] = syscalls.Supported("statfs", Statfs) + s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) + s.Table[161] = syscalls.Supported("chroot", Chroot) + s.Table[162] = syscalls.Supported("sync", Sync) + delete(s.Table, 165) // mount + delete(s.Table, 166) // umount2 + delete(s.Table, 187) // readahead + s.Table[188] = syscalls.Supported("setxattr", Setxattr) + s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr) + s.Table[191] = syscalls.Supported("getxattr", Getxattr) + s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + s.Table[194] = syscalls.Supported("listxattr", Listxattr) + s.Table[195] = syscalls.Supported("llistxattr", Llistxattr) + s.Table[196] = syscalls.Supported("flistxattr", Flistxattr) + s.Table[197] = syscalls.Supported("removexattr", Removexattr) + s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) + delete(s.Table, 206) // io_setup + delete(s.Table, 207) // io_destroy + delete(s.Table, 208) // io_getevents + delete(s.Table, 209) // io_submit + delete(s.Table, 210) // io_cancel + s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) + s.Table[217] = syscalls.Supported("getdents64", Getdents64) + delete(s.Table, 221) // fdavise64 + s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) + s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + s.Table[235] = syscalls.Supported("utimes", Utimes) + delete(s.Table, 253) // inotify_init + delete(s.Table, 254) // inotify_add_watch + delete(s.Table, 255) // inotify_rm_watch + s.Table[257] = syscalls.Supported("openat", Openat) + s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) + s.Table[259] = syscalls.Supported("mknodat", Mknodat) + s.Table[260] = syscalls.Supported("fchownat", Fchownat) + s.Table[261] = syscalls.Supported("futimens", Futimens) + s.Table[262] = syscalls.Supported("newfstatat", Newfstatat) + s.Table[263] = syscalls.Supported("unlinkat", Unlinkat) + s.Table[264] = syscalls.Supported("renameat", Renameat) + s.Table[265] = syscalls.Supported("linkat", Linkat) + s.Table[266] = syscalls.Supported("symlinkat", Symlinkat) + s.Table[267] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[268] = syscalls.Supported("fchmodat", Fchmodat) + s.Table[269] = syscalls.Supported("faccessat", Faccessat) + s.Table[270] = syscalls.Supported("pselect", Pselect) + s.Table[271] = syscalls.Supported("ppoll", Ppoll) + delete(s.Table, 275) // splice + delete(s.Table, 276) // tee + s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + s.Table[280] = syscalls.Supported("utimensat", Utimensat) + s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + delete(s.Table, 282) // signalfd + s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) + s.Table[284] = syscalls.Supported("eventfd", Eventfd) + delete(s.Table, 285) // fallocate + s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) + s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) + s.Table[288] = syscalls.Supported("accept4", Accept4) + delete(s.Table, 289) // signalfd4 + s.Table[290] = syscalls.Supported("eventfd2", Eventfd2) + s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + s.Table[292] = syscalls.Supported("dup3", Dup3) + s.Table[293] = syscalls.Supported("pipe2", Pipe2) + delete(s.Table, 294) // inotify_init1 + s.Table[295] = syscalls.Supported("preadv", Preadv) + s.Table[296] = syscalls.Supported("pwritev", Pwritev) + s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) + s.Table[306] = syscalls.Supported("syncfs", Syncfs) + s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) + s.Table[316] = syscalls.Supported("renameat2", Renameat2) + delete(s.Table, 319) // memfd_create + s.Table[322] = syscalls.Supported("execveat", Execveat) + s.Table[327] = syscalls.Supported("preadv2", Preadv2) + s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) + s.Table[332] = syscalls.Supported("statx", Statx) + s.Init() + + // Override ARM64. + s = linux.ARM64 + s.Table[63] = syscalls.Supported("read", Read) + s.Init() +} diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 4320ad17f..ab1d140d2 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -252,18 +252,23 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) { return ms, m.totalLocked() } -// MinimumTotalMemoryBytes is the minimum reported total system memory. -// -// This can be configured through options provided to the Sentry at start. -// This number is purely synthetic. This is only set before the application -// starts executing, and must not be modified. -var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB +// These options control how much total memory the is reported to the application. +// They may only be set before the application starts executing, and must not +// be modified. +var ( + // MinimumTotalMemoryBytes is the minimum reported total system memory. + MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB + + // MaximumTotalMemoryBytes is the maximum reported total system memory. + // The 0 value indicates no maximum. + MaximumTotalMemoryBytes uint64 +) // TotalMemory returns the "total usable memory" available. // // This number doesn't really have a true value so it's based on the following -// inputs and further bounded to be above some minimum guaranteed value (2GB), -// additionally ensuring that total memory reported is always less than used. +// inputs and further bounded to be above the MinumumTotalMemoryBytes and below +// MaximumTotalMemoryBytes. // // memSize should be the platform.Memory size reported by platform.Memory.TotalSize() // used is the total memory reported by MemoryLocked.Total() @@ -279,5 +284,8 @@ func TotalMemory(memSize, used uint64) uint64 { memSize = uint64(1) << (uint(msb) + 1) } } + if MaximumTotalMemoryBytes > 0 && memSize > MaximumTotalMemoryBytes { + memSize = MaximumTotalMemoryBytes + } return memSize } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 9aeb83fb0..c62505fe2 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,6 +25,7 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -44,6 +45,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fd", + "//pkg/fdnotifier", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", @@ -68,6 +70,7 @@ go_test( name = "vfs_test", size = "small", srcs = [ + "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -79,5 +82,6 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index a64d86122..981bd8caa 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -237,10 +237,13 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { return nil, syserror.ENOTDIR } + if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { + return nil, err + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 82781e6d3..c9e724fef 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -49,3 +49,27 @@ func RootFromContext(ctx context.Context) VirtualDentry { } return VirtualDentry{} } + +type rootContext struct { + context.Context + root VirtualDentry +} + +// WithRoot returns a copy of ctx with the given root. +func WithRoot(ctx context.Context, root VirtualDentry) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8e0b40841..8297f964b 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -192,6 +192,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin mask: mask, userData: event.Data, } + epi.waiter.Callback = epi ep.interest[key] = epi wmask := waiter.EventMaskFromLinux(mask) file.EventRegister(&epi.waiter, wmask) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/vfs/eventfd.go new file mode 100644 index 000000000..f39dacacf --- /dev/null +++ b/pkg/sentry/vfs/eventfd.go @@ -0,0 +1,282 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. +type EventFileDescription struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // queue is used to notify interested parties when the event object + // becomes readable or writable. + queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +var _ FileDescriptionImpl = (*EventFileDescription)(nil) + +// NewEventFD creates a new event fd. +func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. +func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release() +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (efd *EventFileDescription) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/vfs/eventfd_test.go new file mode 100644 index 000000000..2dff2d10b --- /dev/null +++ b/pkg/sentry/vfs/eventfd_test.go @@ -0,0 +1,96 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("NewEventFD failed: %v", err) + } + defer eventfd.DecRef() + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("NewEventFD failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 418d69b96..cfabd936c 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,10 +91,6 @@ type FileDescriptionOptions struct { // ESPIPE. DenyPWrite bool - // if InvalidWrite is true, calls to FileDescription.Write() return - // EINVAL. - InvalidWrite bool - // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling @@ -570,9 +566,6 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - if fd.opts.InvalidWrite { - return 0, syserror.EINVAL - } if !fd.writable { return 0, syserror.EBADF } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 20e5bb072..1edd584c9 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -494,8 +494,14 @@ type FilesystemImpl interface { // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // - // - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED. - BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) + // Errors: + // + // - If the file does not have write permissions, then BoundEndpointAt + // returns EACCES. + // + // - If a non-socket file exists at rp, then BoundEndpointAt returns + // ECONNREFUSED. + BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) // PrependPath prepends a path from vd to vd.Mount().Root() to b. // diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 022bac127..53d364c5c 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -151,6 +151,24 @@ type SetStatOptions struct { Stat linux.Statx } +// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() +// and FilesystemImpl.BoundEndpointAt(). +type BoundEndpointOptions struct { + // Addr is the path of the file whose socket endpoint is being retrieved. + // It is generally irrelevant: most endpoints are stored at a dentry that + // was created through a bind syscall, so the path can be stored on creation. + // However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(), + // then we may not know what the original bind address was. + // + // For example, if connect(2) is called with address "foo" which corresponds + // a remote named socket in goferfs, we need to generate an endpoint wrapping + // that file. In this case, we can use Addr to set the endpoint address to + // "foo". Note that Addr is only a best-effort attempt--we still do not know + // the exact address that was used on the remote fs to bind the socket (it + // may have been "foo", "./foo", etc.). + Addr string +} + // GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), // FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and // FileDescriptionImpl.Getxattr(). diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go index 42b880656..cc536ceaf 100644 --- a/pkg/sentry/vfs/timerfd.go +++ b/pkg/sentry/vfs/timerfd.go @@ -53,7 +53,6 @@ func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*File UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, - InvalidWrite: true, }); err != nil { return nil, err } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9015f2cc1..8d7f8f8af 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -351,33 +351,6 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia } } -// BoundEndpointAt gets the bound endpoint at the given path, if one exists. -func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (transport.BoundEndpoint, error) { - if !pop.Path.Begin.Ok() { - if pop.Path.Absolute { - return nil, syserror.ECONNREFUSED - } - return nil, syserror.ENOENT - } - rp := vfs.getResolvingPath(creds, pop) - for { - bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp) - if err == nil { - vfs.putResolvingPath(rp) - return bep, nil - } - if checkInvariants { - if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) - } - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { @@ -675,6 +648,33 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti } } +// BoundEndpointAt gets the bound endpoint at the given path, if one exists. +func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return nil, syserror.ECONNREFUSED + } + return nil, syserror.ENOENT + } + rp := vfs.getResolvingPath(creds, pop) + for { + bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return bep, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + // ListxattrAt returns all extended attribute names for the file at the given // path. func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { |