diff options
Diffstat (limited to 'pkg/sentry')
48 files changed, 1647 insertions, 256 deletions
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index e74275d2d..0c9a62f0d 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -23,6 +23,7 @@ go_library( "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/host", + "//pkg/sentry/fs/user", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/host", "//pkg/sentry/kernel", diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 2ed17ee09..8767430b7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,7 +18,6 @@ import ( "bytes" "encoding/json" "fmt" - "path" "sort" "strings" "text/tabwriter" @@ -28,10 +27,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsbridge" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -190,17 +189,12 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // transferred to the new process. initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() } - - paths := fs.GetPath(initArgs.Envv) - vfsObj := proc.Kernel.VFS() - file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + file, err := getExecutableFD(ctx, creds, proc.Kernel.VFS(), initArgs.MountNamespaceVFS2, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) if err != nil { - return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in environment %v: %v", initArgs.Argv[0], initArgs.Envv, err) } initArgs.File = fsbridge.NewVFSFile(file) } else { - // Get the full path to the filename from the PATH env variable. - paths := fs.GetPath(initArgs.Envv) if initArgs.MountNamespace == nil { // Set initArgs so that 'ctx' returns the namespace. initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() @@ -209,9 +203,9 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } - f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + f, err := user.ResolveExecutablePath(ctx, creds, initArgs.MountNamespace, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) if err != nil { - return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], initArgs.Envv, err) } initArgs.Filename = f } @@ -429,53 +423,17 @@ func ttyName(tty *kernel.TTY) string { return fmt.Sprintf("pts/%d", tty.Index) } -// ResolveExecutablePath resolves the given executable name given a set of -// paths that might contain it. -func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) { - root := vfs.RootFromContext(ctx) - defer root.DecRef() - creds := auth.CredentialsFromContext(ctx) - - // Absolute paths can be used directly. - if path.IsAbs(name) { - return openExecutable(ctx, vfsObj, creds, root, name) - } - - // Paths with '/' in them should be joined to the working directory, or - // to the root if working directory is not set. - if strings.IndexByte(name, '/') > 0 { - if len(wd) == 0 { - wd = "/" - } - if !path.IsAbs(wd) { - return nil, fmt.Errorf("working directory %q must be absolute", wd) - } - return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name)) +// getExecutableFD resolves the given executable name and returns a +// vfs.FileDescription for the executable file. +func getExecutableFD(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, mns *vfs.MountNamespace, envv []string, wd, name string) (*vfs.FileDescription, error) { + path, err := user.ResolveExecutablePathVFS2(ctx, creds, mns, envv, wd, name) + if err != nil { + return nil, err } - // Otherwise, we must lookup the name in the paths, starting from the - // calling context's root directory. - for _, p := range paths { - if !path.IsAbs(p) { - // Relative paths aren't safe, no one should be using them. - log.Warningf("Skipping relative path %q in $PATH", p) - continue - } - - binPath := path.Join(p, name) - f, err := openExecutable(ctx, vfsObj, creds, root, binPath) - if err != nil { - return nil, err - } - if f == nil { - continue // Not found/no access. - } - return f, nil - } - return nil, syserror.ENOENT -} + root := vfs.RootFromContext(ctx) + defer root.DecRef() -func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) { pop := vfs.PathOperation{ Root: root, Start: root, // binPath is absolute, Start can be anything. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 846252c89..2a278fbe3 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error if !f.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. if f.Flags().Append { @@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // offset." unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) defer unlockAppendMu() - if f.Flags().Append { if err := f.offsetForAppend(ctx, &offset); err != nil { return 0, err @@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 return f.FileOperations.Write(ctx, f, src, offset) } -// offsetForAppend sets the given offset to the end of the file. +// offsetForAppend atomically sets the given offset to the end of the file. // -// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing. +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for +// writing. func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { uattr, err := f.Dirent.Inode.UnstableAttr(ctx) if err != nil { @@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { } // Update the offset. - *offset = uattr.Size + atomic.StoreInt64(offset, uattr.Size) return nil } diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index bdba6efe5..d2dbff268 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -42,9 +42,10 @@ // Dirent.dirMu // Dirent.mu // DirentCache.mu -// Locks in InodeOperations implementations or overlayEntry // Inode.Watches.mu (see `Inotify` for other lock ordering) // MountSource.mu +// Inode.appendMu +// Locks in InodeOperations implementations or overlayEntry // // If multiple Dirent or MountSource locks must be taken, locks in the parent must be // taken before locks in their children. diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index 635cc009b..2ca84dd74 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -76,7 +76,8 @@ ops can be implemented in parallel. #### Minimal client that can mount a trivial FUSE filesystem. -- Implement `/dev/fuse`. +- Implement `/dev/fuse` - a character device used to establish an FD for + communication between the sentry and the server daemon. - Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. @@ -99,7 +100,7 @@ ops can be implemented in parallel. ## FUSE Protocol The FUSE protocol is a request-response protocol. All requests are initiated by -the client. The wire-format for the protocol is raw c structs serialized to +the client. The wire-format for the protocol is raw C structs serialized to memory. All FUSE requests begin with the following request header: @@ -255,6 +256,8 @@ I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`. # References -- `fuse(4)` manpage. -- Linux kernel FUSE documentation: - https://www.kernel.org/doc/html/latest/filesystems/fuse.html +- [fuse(4) Linux manual page](https://www.man7.org/linux/man-pages/man4/fuse.4.html) +- [Linux kernel FUSE documentation](https://www.kernel.org/doc/html/latest/filesystems/fuse.html) +- [The reference implementation of the Linux FUSE (Filesystem in Userspace) + interface](https://github.com/libfuse/libfuse) +- [The kernel interface of FUSE](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fuse.h) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index b414ddaee..3f2bd0e87 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -17,13 +17,9 @@ package fs import ( "fmt" "math" - "path" - "strings" "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -625,71 +621,3 @@ func (mns *MountNamespace) SyncAll(ctx context.Context) { defer mns.mu.Unlock() mns.root.SyncAll(ctx) } - -// ResolveExecutablePath resolves the given executable name given a set of -// paths that might contain it. -func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name string, paths []string) (string, error) { - // Absolute paths can be used directly. - if path.IsAbs(name) { - return name, nil - } - - // Paths with '/' in them should be joined to the working directory, or - // to the root if working directory is not set. - if strings.IndexByte(name, '/') > 0 { - if wd == "" { - wd = "/" - } - if !path.IsAbs(wd) { - return "", fmt.Errorf("working directory %q must be absolute", wd) - } - return path.Join(wd, name), nil - } - - // Otherwise, We must lookup the name in the paths, starting from the - // calling context's root directory. - root := RootFromContext(ctx) - if root == nil { - // Caller has no root. Don't bother traversing anything. - return "", syserror.ENOENT - } - defer root.DecRef() - for _, p := range paths { - binPath := path.Join(p, name) - traversals := uint(linux.MaxSymlinkTraversals) - d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) - if err == syserror.ENOENT || err == syserror.EACCES { - // Didn't find it here. - continue - } - if err != nil { - return "", err - } - defer d.DecRef() - - // Check that it is a regular file. - if !IsRegular(d.Inode.StableAttr) { - continue - } - - // Check whether we can read and execute the found file. - if err := d.Inode.CheckPermission(ctx, PermMask{Read: true, Execute: true}); err != nil { - log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) - continue - } - return path.Join("/", p, name), nil - } - return "", syserror.ENOENT -} - -// GetPath returns the PATH as a slice of strings given the environment -// variables. -func GetPath(env []string) []string { - const prefix = "PATH=" - for _, e := range env { - if strings.HasPrefix(e, prefix) { - return strings.Split(strings.TrimPrefix(e, prefix), ":") - } - } - return nil -} diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD index f37f979f1..bd5dac373 100644 --- a/pkg/sentry/fs/user/BUILD +++ b/pkg/sentry/fs/user/BUILD @@ -4,15 +4,20 @@ package(licenses = ["notice"]) go_library( name = "user", - srcs = ["user.go"], + srcs = [ + "path.go", + "user.go", + ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go new file mode 100644 index 000000000..fbd4547a7 --- /dev/null +++ b/pkg/sentry/fs/user/path.go @@ -0,0 +1,169 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package user + +import ( + "fmt" + "path" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ResolveExecutablePath resolves the given executable name given the working +// dir and environment. +func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs.MountNamespace, envv []string, wd, name string) (string, error) { + // Absolute paths can be used directly. + if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // calling context's root directory. + paths := getPath(envv) + + root := fs.RootFromContext(ctx) + if root == nil { + // Caller has no root. Don't bother traversing anything. + return "", syserror.ENOENT + } + defer root.DecRef() + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + traversals := uint(linux.MaxSymlinkTraversals) + d, err := mns.FindInode(ctx, root, nil, binPath, &traversals) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + defer d.DecRef() + + // Check that it is a regular file. + if !fs.IsRegular(d.Inode.StableAttr) { + continue + } + + // Check whether we can read and execute the found file. + if err := d.Inode.CheckPermission(ctx, fs.PermMask{Read: true, Execute: true}); err != nil { + log.Infof("Found executable at %q, but user cannot execute it: %v", binPath, err) + continue + } + return path.Join("/", p, name), nil + } + + // Couldn't find it. + return "", syserror.ENOENT +} + +// ResolveExecutablePathVFS2 resolves the given executable name given the +// working dir and environment. +func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, envv []string, wd, name string) (string, error) { + // Absolute paths can be used directly. + if path.IsAbs(name) { + return name, nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if wd == "" { + wd = "/" + } + if !path.IsAbs(wd) { + return "", fmt.Errorf("working directory %q must be absolute", wd) + } + return path.Join(wd, name), nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // calling context's root directory. + paths := getPath(envv) + + root := mns.Root() + defer root.DecRef() + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return "", err + } + dentry.DecRef() + + return binPath, nil + } + + // Couldn't find it. + return "", syserror.ENOENT +} + +// getPath returns the PATH as a slice of strings given the environment +// variables. +func getPath(env []string) []string { + const prefix = "PATH=" + for _, e := range env { + if strings.HasPrefix(e, prefix) { + return strings.Split(strings.TrimPrefix(e, prefix), ":") + } + } + return nil +} diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index fe7f67c00..f4d525523 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package user contains methods for resolving filesystem paths based on the +// user and their environment. package user import ( diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index bfbd7c3d4..6bd1a9fc6 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -60,3 +60,15 @@ func (d *dentry) DecRef() { // inode.decRef(). d.inode.decRef() } + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 67e916525..f5f35a3bc 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -35,6 +35,7 @@ go_library( "fstree.go", "gofer.go", "handle.go", + "host_named_pipe.go", "p9file.go", "regular_file.go", "socket.go", @@ -47,6 +48,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fd", + "//pkg/fdnotifier", "//pkg/fspath", "//pkg/log", "//pkg/p9", @@ -71,6 +73,7 @@ go_library( "//pkg/unet", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 7f2181216..36e0e1856 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -760,7 +760,7 @@ afterTrailingSymlink: parent.dirMu.Unlock() return nil, syserror.EPERM } - fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() return fd, err } @@ -873,19 +873,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) + // We assume that the server silently inserts O_NONBLOCK in the open flags + // for all named pipes (because all existing gofers do this). + // + // NOTE(b/133875563): This makes named pipe opens racy, because the + // mechanisms for translating nonblocking to blocking opens can only detect + // the instantaneous presence of a peer holding the other end of the pipe + // open, not whether the pipe was *previously* opened by a peer that has + // since closed its end. + isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 +retry: + h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { + if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO { + // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails + // with ENXIO if opening the same named pipe with O_WRONLY would + // block because there are no readers of the pipe. + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return nil, err + } + goto retry + } return nil, err } - seekable := d.fileType() == linux.S_IFREG - fd := &specialFileFD{ - handle: h, - seekable: seekable, + if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { + if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil { + h.close(ctx) + return nil, err + } } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - DenyPRead: !seekable, - DenyPWrite: !seekable, - }); err != nil { + fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + if err != nil { h.close(ctx) return nil, err } @@ -894,7 +912,7 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. // !d.isSynthetic(). -func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } @@ -947,6 +965,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } + *ds = appendDentry(*ds, child) // Incorporate the fid that was opened by lcreate. useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { @@ -959,10 +978,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } - // Take a reference on the new dentry to be held by the new file - // description. (This reference also means that the new dentry is not - // eligible for caching yet, so we don't need to append to a dentry slice.) - child.refs = 1 // Insert the dentry into the tree. d.cacheNewChildLocked(child, name) if d.cachedMetadataAuthoritative() { @@ -981,22 +996,16 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { - seekable := child.fileType() == linux.S_IFREG - fd := &specialFileFD{ - handle: handle{ - file: openFile, - fd: -1, - }, - seekable: seekable, + h := handle{ + file: openFile, + fd: -1, } if fdobj != nil { - fd.handle.fd = int32(fdobj.Release()) + h.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ - DenyPRead: !seekable, - DenyPWrite: !seekable, - }); err != nil { - fd.handle.close(ctx) + fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + if err != nil { + h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 131da332f..3f3bd56f0 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1039,6 +1039,18 @@ func (d *dentry) decRefLocked() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go new file mode 100644 index 000000000..7294de7d6 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "sync" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create +// pipes after sentry initialization due to syscall filters. +var ( + tempPipeMu sync.Mutex + tempPipeReadFD int + tempPipeWriteFD int + tempPipeBuf [1]byte +) + +func init() { + var pipeFDs [2]int + if err := unix.Pipe(pipeFDs[:]); err != nil { + panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err)) + } + tempPipeReadFD = pipeFDs[0] + tempPipeWriteFD = pipeFDs[1] +} + +func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { + for { + ok, err := nonblockingPipeHasWriter(fd) + if err != nil { + return err + } + if ok { + return nil + } + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return err + } + } +} + +func nonblockingPipeHasWriter(fd int32) (bool, error) { + tempPipeMu.Lock() + defer tempPipeMu.Unlock() + // Copy 1 byte from fd into the temporary pipe. + n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) + if err == syserror.EAGAIN { + // The pipe represented by fd is empty, but has a writer. + return true, nil + } + if err != nil { + return false, err + } + if n == 0 { + // The pipe represented by fd is empty and has no writer. + return false, nil + } + // The pipe represented by fd is non-empty, so it either has, or has + // previously had, a writer. Remove the byte copied to the temporary pipe + // before returning. + if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 { + panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err)) + } + return true, nil +} + +func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { + t := time.NewTimer(100 * time.Millisecond) + defer t.Stop() + cancel := ctx.SleepStart() + select { + case <-t.C: + ctx.SleepFinish(true) + return nil + case <-cancel: + ctx.SleepFinish(false) + return syserror.ErrInterrupted + } +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a464e6a94..ff6126b87 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -19,17 +19,18 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) -// specialFileFD implements vfs.FileDescriptionImpl for files other than -// regular files, directories, and symlinks: pipes, sockets, etc. It is also -// used for regular files when filesystemOptions.specialRegularFiles is in -// effect. specialFileFD differs from regularFileFD by using per-FD handles -// instead of shared per-dentry handles, and never buffering I/O. +// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device +// special files, and (when filesystemOptions.specialRegularFiles is in effect) +// regular files. specialFileFD differs from regularFileFD by using per-FD +// handles instead of shared per-dentry handles, and never buffering I/O. type specialFileFD struct { fileDescription @@ -40,13 +41,47 @@ type specialFileFD struct { // file offset is significant, i.e. a regular file. seekable is immutable. seekable bool + // mayBlock is true if this file description represents a file for which + // queue may send I/O readiness events. mayBlock is immutable. + mayBlock bool + queue waiter.Queue + // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex off int64 } +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { + ftype := d.fileType() + seekable := ftype == linux.S_IFREG + mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK + fd := &specialFileFD{ + handle: h, + seekable: seekable, + mayBlock: mayBlock, + } + if mayBlock && h.fd >= 0 { + if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { + return nil, err + } + } + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { + if mayBlock && h.fd >= 0 { + fdnotifier.RemoveFD(h.fd) + } + return nil, err + } + return fd, nil +} + // Release implements vfs.FileDescriptionImpl.Release. func (fd *specialFileFD) Release() { + if fd.mayBlock && fd.handle.fd >= 0 { + fdnotifier.RemoveFD(fd.handle.fd) + } fd.handle.close(context.Background()) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() @@ -62,6 +97,32 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { return fd.handle.file.flush(ctx) } +// Readiness implements waiter.Waitable.Readiness. +func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + if fd.mayBlock { + return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) + } + return fd.fileDescription.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + if fd.mayBlock { + fd.queue.EventRegister(e, mask) + return + } + fd.fileDescription.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { + if fd.mayBlock { + fd.queue.EventUnregister(e) + return + } + fd.fileDescription.EventUnregister(e) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { @@ -81,6 +142,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } if n == 0 { return 0, err } @@ -130,6 +194,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } return int64(n), err } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a83151ad3..bbee8ccda 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -225,9 +225,21 @@ func (d *Dentry) destroy() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) Watches() *vfs.Watches { + return nil +} + // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on -// it's own isn't sufficient to insert a child into a directory. InsertChild +// its own isn't sufficient to insert a child into a directory. InsertChild // updates the link count on d if required. // // Precondition: d must represent a directory inode. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 007be1572..062321cbc 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -59,6 +59,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index f2399981b..70387cb9c 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -79,6 +79,7 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() + child.unlinked = true } type directoryFD struct { @@ -112,6 +113,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba dir.iterMu.Lock() defer dir.iterMu.Unlock() + fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 80fa7b29d..183eb975c 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if err := create(parentDir, name); err != nil { return err } + + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) parentDir.inode.touchCMtime() return nil } @@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() + d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) @@ -354,6 +361,7 @@ afterTrailingSymlink: if err != nil { return nil, err } + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) parentDir.inode.touchCMtime() return fd, nil } @@ -559,6 +567,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParentDir.inode.touchCMtime() } renamed.inode.touchCtime() + + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) return nil } @@ -603,8 +613,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) - parentDir.inode.decLinksLocked() // from child's ".." + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) + // Remove links for child, child/., and child/.. child.inode.decLinksLocked() + child.inode.decLinksLocked() + parentDir.inode.decLinksLocked() vfsObj.CommitDeleteDentry(&child.vfsd) parentDir.inode.touchCMtime() return nil @@ -618,7 +631,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat) + if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -698,6 +718,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } + + // Generate inotify events. Note that this must take place before the link + // count of the child is decremented, or else the watches may be dropped + // before these events are added. + vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(&child.vfsd) @@ -754,7 +780,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.inode.setxattr(rp.Credentials(), &opts) + if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -765,7 +796,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.inode.removexattr(rp.Credentials(), name) + if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 1e781aecd..f0e098702 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -163,6 +163,11 @@ type dentry struct { // filesystem.mu. name string + // unlinked indicates whether this dentry has been unlinked from its parent. + // It is only set to true on an unlink operation, and never set from true to + // false. unlinked is protected by filesystem.mu. + unlinked bool + // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry @@ -201,6 +206,26 @@ func (d *dentry) DecRef() { d.inode.decRef() } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { + if d.inode.isDir() { + events |= linux.IN_ISDIR + } + + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + // Note that d.parent or d.name may be stale if there is a concurrent + // rename operation. Inotify does not provide consistency guarantees. + d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked) + } + d.inode.watches.Notify("", events, cookie, et) +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.inode.watches +} + // inode represents a filesystem object. type inode struct { // fs is the owning filesystem. fs is immutable. @@ -209,11 +234,9 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory // operations. // - // A reference is held on all inodes that are reachable in the filesystem - // tree. For non-directories (which may have multiple hard links), this - // means that a reference is dropped when nlink reaches 0. For directories, - // nlink never reaches 0 due to the "." entry; instead, - // filesystem.RmdirAt() drops the reference. + // A reference is held on all inodes as long as they are reachable in the + // filesystem tree, i.e. nlink is nonzero. This reference is dropped when + // nlink reaches 0. refs int64 // xattrs implements extended attributes. @@ -238,6 +261,9 @@ type inode struct { // Advisory file locks, which lock at the inode level. locks lock.FileLocks + // Inotify watches for this inode. + watches vfs.Watches + impl interface{} // immutable } @@ -259,6 +285,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.ctime = now i.mtime = now // i.nlink initialized by caller + i.watches = vfs.Watches{} i.impl = impl } @@ -276,14 +303,17 @@ func (i *inode) incLinksLocked() { atomic.AddUint32(&i.nlink, 1) } -// decLinksLocked decrements i's link count. +// decLinksLocked decrements i's link count. If the link count reaches 0, we +// remove a reference on i as well. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } - atomic.AddUint32(&i.nlink, ^uint32(0)) + if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { + i.decRef() + } } func (i *inode) incRef() { @@ -306,6 +336,7 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.watches.HandleDeletion() if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -627,8 +658,12 @@ func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode + return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. @@ -641,7 +676,15 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return fd.inode().setStat(ctx, creds, &opts.Stat) + d := fd.dentry() + if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. @@ -656,12 +699,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) + d := fd.dentry() + if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) + d := fd.dentry() + if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // NewMemfd creates a new tmpfs regular file and file description that can back diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ed40b5303..dbfcef0fa 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // TODO(gvisor.dev/issue/1480): Release locks. - // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if file.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) // Drop the table reference. file.DecRef() diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 9add7c944..2407014e9 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index e42505542..750751aa3 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -60,6 +60,12 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { if !vr.accessType.Write && vr.accessType.Read { rdonlyRegions = append(rdonlyRegions, vr.region) } + + // TODO(gvisor.dev/issue/2686): PROT_NONE should be specially treated. + // Workaround: treated as rdonly temporarily. + if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute { + rdonlyRegions = append(rdonlyRegions, vr.region) + } }) for _, r := range rdonlyRegions { diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index f04be2ab5..de7df4f80 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 2ae6b9f9d..0bee995e4 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 789bb94c8..66015e2bc 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -64,6 +64,8 @@ const enableLogging = false var emptyFilter = stack.IPHeaderFilter{ Dst: "\x00\x00\x00\x00", DstMask: "\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00", } // nflog logs messages related to the writing and reading of iptables. @@ -142,31 +144,27 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen } func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) { - ipt := stk.IPTables() - table, ok := ipt.Tables[tablename.String()] + table, ok := stk.IPTables().GetTable(tablename.String()) if !ok { return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename) } return table, nil } -// FillDefaultIPTables sets stack's IPTables to the default tables and -// populates them with metadata. -func FillDefaultIPTables(stk *stack.Stack) { - ipt := stack.DefaultTables() - - // In order to fill in the metadata, we have to translate ipt from its - // netstack format to Linux's giant-binary-blob format. - for name, table := range ipt.Tables { - _, metadata, err := convertNetstackToBinary(name, table) - if err != nil { - panic(fmt.Errorf("Unable to set default IP tables: %v", err)) +// FillIPTablesMetadata populates stack's IPTables with metadata. +func FillIPTablesMetadata(stk *stack.Stack) { + stk.IPTables().ModifyTables(func(tables map[string]stack.Table) { + // In order to fill in the metadata, we have to translate ipt from its + // netstack format to Linux's giant-binary-blob format. + for name, table := range tables { + _, metadata, err := convertNetstackToBinary(name, table) + if err != nil { + panic(fmt.Errorf("Unable to set default IP tables: %v", err)) + } + table.SetMetadata(metadata) + tables[name] = table } - table.SetMetadata(metadata) - ipt.Tables[name] = table - } - - stk.SetIPTables(ipt) + }) } // convertNetstackToBinary converts the iptables as stored in netstack to the @@ -214,11 +212,16 @@ func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelI } copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src) + copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask) copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) if rule.Filter.DstInvert { entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP } + if rule.Filter.SrcInvert { + entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP + } if rule.Filter.OutputInterfaceInvert { entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT } @@ -566,15 +569,13 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. - ipt := stk.IPTables() table.SetMetadata(metadata{ HookEntry: replace.HookEntry, Underflow: replace.Underflow, NumEntries: replace.NumEntries, Size: replace.Size, }) - ipt.Tables[replace.Name.String()] = table - stk.SetIPTables(ipt) + stk.IPTables().ReplaceTable(replace.Name.String(), table) return nil } @@ -737,6 +738,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) } + if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) if n == -1 { @@ -755,6 +759,9 @@ func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { Dst: tcpip.Address(iptip.Dst[:]), DstMask: tcpip.Address(iptip.DstMask[:]), DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, OutputInterface: ifname, OutputInterfaceMask: ifnameMask, OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, @@ -765,15 +772,13 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool { // The following features are supported: // - Protocol // - Dst and DstMask + // - Src and SrcMask // - The inverse destination IP check flag // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInetAddr = linux.InetAddr{} var emptyInterface = [linux.IFNAMSIZ]byte{} // Disable any supported inverse flags. - inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_VIA_OUT) - return iptip.Src != emptyInetAddr || - iptip.SrcMask != emptyInetAddr || - iptip.InputInterface != emptyInterface || + inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || iptip.InputInterfaceMask != emptyInterface || iptip.Flags != 0 || iptip.InverseFlags&^inverseMask != 0 diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 3863293c7..1b4e0ad79 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string { } // Match implements Matcher.Match. -func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { // Support only for OUTPUT chain. // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 57a1e1c12..ebabdf334 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -96,7 +96,7 @@ func (*TCPMatcher) Name() string { } // Match implements Matcher.Match. -func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { netHeader := header.IPv4(pkt.NetworkHeader) if netHeader.TransportProtocol() != header.TCPProtocolNumber { diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index cfa9e621d..98b9943f8 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -93,7 +93,7 @@ func (*UDPMatcher) Name() string { } // Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { netHeader := header.IPv4(pkt.NetworkHeader) // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index f5fa18136..9b44c2b89 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -362,14 +362,13 @@ func (s *Stack) RouteTable() []inet.Route { } // IPTables returns the stack's iptables. -func (s *Stack) IPTables() (stack.IPTables, error) { +func (s *Stack) IPTables() (*stack.IPTables, error) { return s.Stack.IPTables(), nil } -// FillDefaultIPTables sets the stack's iptables to the default tables, which -// allow and do not modify all traffic. -func (s *Stack) FillDefaultIPTables() { - netfilter.FillDefaultIPTables(s.Stack) +// FillIPTablesMetadata populates stack's IPTables with metadata. +func (s *Stack) FillIPTablesMetadata() { + netfilter.FillIPTablesMetadata(s.Stack) } // Resume implements inet.Stack.Resume. diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index ce5b94ee7..09c6d3b27 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -252,7 +252,7 @@ func (e *connectionedEndpoint) Close() { // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { - return syserr.ErrConnectionRefused + return syserr.ErrWrongProtocolForSocket } // Check if ce is e to avoid a deadlock. diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5b29e9d7f..c4c9db81b 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -417,7 +417,18 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool defer ep.Release() // Connect the server endpoint. - return s.ep.Connect(t, ep) + err = s.ep.Connect(t, ep) + + if err == syserr.ErrWrongProtocolForSocket { + // Linux for abstract sockets returns ErrConnectionRefused + // instead of ErrWrongProtocolForSocket. + path, _ := extractPath(sockaddr) + if len(path) > 0 && path[0] == 0 { + err = syserr.ErrConnectionRefused + } + } + + return err } // Write implements fs.FileOperations.Write. diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 39f2b79ec..77c78889d 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -80,6 +80,12 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB } } + if total > 0 { + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dirent.InotifyEvent(linux.IN_MODIFY, 0) + } return total, err } diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 2de5e3422..c24946160 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -207,7 +207,11 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si return syserror.EOPNOTSUPP } - return d.Inode.SetXattr(t, d, name, value, flags) + if err := d.Inode.SetXattr(t, d, name, value, flags); err != nil { + return err + } + d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil } func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { @@ -418,7 +422,11 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { return syserror.EOPNOTSUPP } - return d.Inode.RemoveXattr(t, d, name) + if err := d.Inode.RemoveXattr(t, d, name); err != nil { + return err + } + d.InotifyEvent(linux.IN_ATTRIB, 0) + return nil } // LINT.ThenChange(vfs2/xattr.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index d56927ff5..9c8b44f64 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -12,6 +12,7 @@ go_library( "filesystem.go", "fscontext.go", "getdents.go", + "inotify.go", "ioctl.go", "memfd.go", "mmap.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go new file mode 100644 index 000000000..7d50b6a16 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC + +// InotifyInit1 implements the inotify_init1() syscalls. +func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^allFlags != 0 { + return 0, nil, syserror.EINVAL + } + + ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) + if err != nil { + return 0, nil, err + } + defer ino.DecRef() + + fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) { + f := t.GetFileVFS2(fd) + if f == nil { + // Invalid fd. + return nil, nil, syserror.EBADF + } + + ino, ok := f.Impl().(*vfs.Inotify) + if !ok { + // Not an inotify fd. + f.DecRef() + return nil, nil, syserror.EINVAL + } + + return ino, f, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mask := args[2].Uint() + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syserror.EINVAL + } + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + follow := followFinalSymlink + if mask&linux.IN_DONT_FOLLOW == 0 { + follow = nofollowFinalSymlink + } + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + if mask&linux.IN_ONLYDIR != 0 { + path.Dir = true + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) + if err != nil { + return 0, nil, err + } + defer d.DecRef() + + fd = ino.AddWatch(d.Dentry(), mask) + return uintptr(fd), nil, err +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + wd := args[1].Int() + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 3a7ef24f5..7f9debd4a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return n, err } @@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return total, err } @@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return n, err } @@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return total, err } @@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } return n, err } @@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } return total, err } @@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return n, err } @@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + } return total, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 8f3c22a02..945a364a7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -187,6 +187,11 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if n == 0 { return 0, nil, err } + + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 083fdcf82..ef8358b8a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -116,9 +116,9 @@ func Override() { s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) - delete(s.Table, 253) // inotify_init - delete(s.Table, 254) // inotify_add_watch - delete(s.Table, 255) // inotify_rm_watch + s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) + s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) s.Table[257] = syscalls.Supported("openat", Openat) s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) s.Table[259] = syscalls.Supported("mknodat", Mknodat) @@ -151,7 +151,7 @@ func Override() { s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) s.Table[292] = syscalls.Supported("dup3", Dup3) s.Table[293] = syscalls.Supported("pipe2", Pipe2) - delete(s.Table, 294) // inotify_init1 + s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) s.Table[295] = syscalls.Supported("preadv", Preadv) s.Table[296] = syscalls.Supported("pwritev", Pwritev) s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 94d69c1cc..774cc66cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -15,6 +15,18 @@ go_template_instance( }, ) +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "vfs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Event", + "Linker": "*Event", + }, +) + go_library( name = "vfs", srcs = [ @@ -25,11 +37,13 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "event_list.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", "filesystem_impl_util.go", "filesystem_type.go", + "inotify.go", "mount.go", "mount_unsafe.go", "options.go", @@ -57,6 +71,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 9aa133bcb..66f3105bd 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -39,8 +39,8 @@ Mount references are held by: - Mount: Each referenced Mount holds a reference on its parent, which is the mount containing its mount point. -- VirtualFilesystem: A reference is held on each Mount that has not been - umounted. +- VirtualFilesystem: A reference is held on each Mount that has been connected + to a mount point, but not yet umounted. MountNamespace and FileDescription references are held by users of VFS. The expectation is that each `kernel.Task` holds a reference on its corresponding diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index caf770fd5..b7c6b60b8 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool { func (d *anonDentry) DecRef() { // no-op } + +// InotifyWithParent implements DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {} + +// Watches implements DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) Watches() *Watches { + return nil +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 8624dbd5d..24af13eb1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -103,6 +103,22 @@ type DentryImpl interface { // DecRef decrements the Dentry's reference count. DecRef() + + // InotifyWithParent notifies all watches on the targets represented by this + // dentry and its parent. The parent's watches are notified first, followed + // by this dentry's. + // + // InotifyWithParent automatically adds the IN_ISDIR flag for dentries + // representing directories. + // + // Note that the events may not actually propagate up to the user, depending + // on the event masks. + InotifyWithParent(events uint32, cookie uint32, et EventType) + + // Watches returns the set of inotify watches for the file corresponding to + // the Dentry. Dentries that are hard links to the same underlying file + // share the same watches. + Watches() *Watches } // IncRef increments d's reference count. @@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } +// InotifyWithParent notifies all watches on the inodes for this dentry and +// its parent of events. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) { + d.impl.InotifyWithParent(events, cookie, et) +} + +// Watches returns the set of inotify watches associated with d. +func (d *Dentry) Watches() *Watches { + return d.impl.Watches() +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go new file mode 100644 index 000000000..05a3051a4 --- /dev/null +++ b/pkg/sentry/vfs/inotify.go @@ -0,0 +1,697 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// EventType defines different kinds of inotfiy events. +// +// The way events are labelled appears somewhat arbitrary, but they must match +// Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +type EventType uint8 + +// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and +// FSNOTIFY_EVENT_INODE in Linux. +const ( + PathEvent EventType = iota + InodeEvent EventType = iota +) + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements FileDescriptionImpl. +// +// Lock ordering: +// Inotify.mu -> Watches.mu -> Inotify.evMu +// +// +stateify savable +type Inotify struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // queue is used to notify interested parties when the inotify instance + // becomes readable or writable. + queue waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock while + // queuing events: using mu may violate lock ordering, since at that point + // the calling goroutine may already hold Watches.mu. + evMu sync.Mutex `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events eventList + + // A scratch buffer, used to serialize inotify events. Allocate this + // ahead of time for the sake of performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // nextWatchMinusOne is used to allocate watch descriptors on this Inotify + // instance. Note that Linux starts numbering watch descriptors from 1. + nextWatchMinusOne int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +var _ FileDescriptionImpl = (*Inotify)(nil) + +// NewInotifyFD constructs a new Inotify instance. +func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { + // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. + flags &^= linux.O_CLOEXEC + if flags&^linux.O_NONBLOCK != 0 { + return nil, syserror.EINVAL + } + + id := uniqueid.GlobalFromContext(ctx) + vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) + defer vd.DecRef() + fd := &Inotify{ + id: id, + scratch: make([]byte, inotifyEventBaseSize), + watches: make(map[int32]*Watch), + } + if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. Release removes all +// watches and frees all resources for an inotify instance. +func (i *Inotify) Release() { + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.handleDeletion from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + defer i.mu.Unlock() + for _, w := range i.watches { + // Remove references to the watch from the watches set on the target. We + // don't need to worry about the references from i.watches, since this + // file description is about to be destroyed. + w.set.Remove(i.id) + } +} + +// EventRegister implements waiter.Waitable. +func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + i.queue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable. +func (i *Inotify) EventUnregister(e *waiter.Entry) { + i.queue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// PRead implements FileDescriptionImpl. +func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// PWrite implements FileDescriptionImpl. +func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements FileDescriptionImpl.Write. +func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileDescriptionImpl.Read. +func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for it := i.events.Front(); it != nil; { + // Advance `it` before the element is removed from the list, or else + // it.Next() will always be nil. + event := it + it = it.Next() + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(event) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + n += uint32(e.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + i.evMu.Unlock() + return + } + } + + i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. + i.evMu.Unlock() + + i.queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +// +// Precondition: i.mu must be locked. +func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch { + targetWatches := target.Watches() + w := &Watch{ + owner: i, + wd: i.nextWatchIDLocked(), + set: targetWatches, + mask: mask, + } + + // Hold the watch in this inotify instance as well as the watch set on the + // target. + i.watches[w.wd] = w + targetWatches.Add(w) + return w +} + +// newWatchIDLocked allocates and returns a new watch descriptor. +// +// Precondition: i.mu must be locked. +func (i *Inotify) nextWatchIDLocked() int32 { + i.nextWatchMinusOne++ + return i.nextWatchMinusOne +} + +// handleDeletion handles the deletion of the target of watch w. It removes w +// from i.watches and a watch removal event is generated. +func (i *Inotify) handleDeletion(w *Watch) { + i.mu.Lock() + _, found := i.watches[w.wd] + delete(i.watches, w.wd) + i.mu.Unlock() + + if found { + i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) + } +} + +// AddWatch constructs a new inotify watch and adds it to the target. It +// returns the watch descriptor returned by inotify_add_watch(2). +func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + // Does the target already have a watch from this inotify instance? + if existing := target.Watches().Lookup(i.id); existing != nil { + newmask := mask + if mask&linux.IN_MASK_ADD != 0 { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + newmask |= atomic.LoadUint32(&existing.mask) + } + atomic.StoreUint32(&existing.mask, newmask) + return existing.wd + } + + // No existing watch, create a new watch. + w := i.newWatchLocked(target, mask) + return w.wd +} + +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to removed. + w, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + w.set.Remove(w.OwnerID()) + i.mu.Unlock() + + // Generate the event for the removal. + i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) + + return nil +} + +// Watches is the collection of all inotify watches on a single file. +// +// +stateify savable +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch +} + +// Lookup returns the watch owned by an inotify instance with the given id. +// Returns nil if no such watch exists. +// +// Precondition: the inotify instance with the given id must be locked to +// prevent the returned watch from being concurrently modified or replaced in +// Inotify.watches. +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + owner := watch.OwnerID() + // Sanity check, we should never have two watches for one owner on the + // same target. + if _, exists := w.ws[owner]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", owner)) + } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } + w.ws[owner] = watch +} + +// Remove removes a watch with the given id from this set of watches and +// releases it. The caller is responsible for generating any watch removal +// event, as appropriate. The provided id must match an existing watch in this +// collection. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no references on the target because we raced with the + // destructor notifying all the watch owners of destruction. See the + // comment in Watches.HandleDeletion for why this race exists. + return + } + + if _, ok := w.ws[id]; !ok { + // While there's technically no problem with silently ignoring a missing + // watch, this is almost certainly a bug. + panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + } + delete(w.ws, id) +} + +// Notify queues a new event with all watches in this set. +func (w *Watches) Notify(name string, events, cookie uint32, et EventType) { + w.NotifyWithExclusions(name, events, cookie, et, false) +} + +// NotifyWithExclusions queues a new event with watches in this set. Watches +// with IN_EXCL_UNLINK are skipped if the event is coming from a child that +// has been unlinked. +func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) { + // N.B. We don't defer the unlocks because Notify is in the hot path of + // all IO operations, and the defer costs too much for small IO + // operations. + w.mu.RLock() + for _, watch := range w.ws { + if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent { + continue + } + watch.Notify(name, events, cookie) + } + w.mu.RUnlock() +} + +// HandleDeletion is called when the watch target is destroyed to emit +// the appropriate events. +func (w *Watches) HandleDeletion() { + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) + + // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied + // by value. Ideally, we wouldn't have this circular locking so we can just + // notify of IN_DELETE_SELF in the same loop below. + // + // We can't hold w.mu while calling watch.handleDeletion to preserve lock + // ordering w.r.t to the owner inotify instances. Instead, atomically move + // the watches map into a local variable so we can iterate over it safely. + // + // Because of this however, it is possible for the watches' owners to reach + // this inode while the inode has no refs. This is still safe because the + // owners can only reach the inode until this function finishes calling + // watch.handleDeletion below and the inode is guaranteed to exist in the + // meantime. But we still have to be very careful not to rely on inode state + // that may have been already destroyed. + var ws map[uint64]*Watch + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + for _, watch := range ws { + // TODO(gvisor.dev/issue/1479): consider refactoring this. + watch.handleDeletion() + } +} + +// Watch represent a particular inotify watch created by inotify_add_watch. +// +// +stateify savable +type Watch struct { + // Inotify instance which owns this watch. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + wd int32 + + // set is the watch set containing this watch. It belongs to the target file + // of this watch. + set *Watches + + // Events being monitored via this watch. Must be accessed with atomic + // memory operations. + mask uint32 +} + +// OwnerID returns the id of the inotify instance that owns this watch. +func (w *Watch) OwnerID() uint64 { + return w.owner.id +} + +// ExcludeUnlinkedChildren indicates whether the watched object should continue +// to be notified of events of its children after they have been unlinked, e.g. +// for an open file descriptor. +// +// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK. +// We can do this by keeping track of the set of unlinked children in Watches +// to skip notification. +func (w *Watch) ExcludeUnlinkedChildren() bool { + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 +} + +// Notify queues a new event on this watch. +func (w *Watch) Notify(name string, events uint32, cookie uint32) { + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { + // We weren't watching for this event. + return + } + + // Event mask should include bits matched from the watch plus all control + // event bits. + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | mask + matchedEvents := effectiveMask & events + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) +} + +// handleDeletion handles the deletion of w's target. +func (w *Watch) handleDeletion() { + w.owner.handleDeletion(w) +} + +// Event represents a struct inotify_event from linux. +// +// +stateify savable +type Event struct { + eventEntry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. + name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in the 's' plus at least one null byte. +func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("overflow") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Santiy check. + if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} + +// InotifyEventFromStatMask generates the appropriate events for an operation +// that set the stats specified in mask. +func InotifyEventFromStatMask(mask uint32) uint32 { + var ev uint32 + if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { + ev |= linux.IN_ATTRIB + } + if mask&linux.STATX_SIZE != 0 { + ev |= linux.IN_MODIFY + } + + if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { + // Both times indicates a utime(s) call. + ev |= linux.IN_ATTRIB + } else if mask&linux.STATX_ATIME != 0 { + ev |= linux.IN_ACCESS + } else if mask&linux.STATX_MTIME != 0 { + mask |= linux.IN_MODIFY + } + return ev +} + +// InotifyRemoveChild sends the appriopriate notifications to the watch sets of +// the child being removed and its parent. +func InotifyRemoveChild(self, parent *Watches, name string) { + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) + // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. +} + +// InotifyRename sends the appriopriate notifications to the watch sets of the +// file being renamed and its old/new parents. +func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { + var dirEv uint32 + if isDir { + dirEv = linux.IN_ISDIR + } + cookie := uniqueid.InotifyCookie(ctx) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) + // Somewhat surprisingly, self move events do not have a cookie. + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 02850b65c..3adb7c97d 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -28,9 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// lastMountID is used to allocate mount ids. Must be accessed atomically. -var lastMountID uint64 - // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem // (Mount.fs), which applies to path resolution in the context of a particular @@ -97,7 +94,7 @@ type Mount struct { func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ - ID: atomic.AddUint64(&lastMountID, 1), + ID: atomic.AddUint64(&vfs.lastMountID, 1), vfs: vfs, fs: fs, root: root, @@ -111,6 +108,16 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount return mnt } +// Options returns a copy of the MountOptions currently applicable to mnt. +func (mnt *Mount) Options() MountOptions { + mnt.vfs.mountMu.Lock() + defer mnt.vfs.mountMu.Unlock() + return MountOptions{ + Flags: mnt.flags, + ReadOnly: mnt.readOnly(), + } +} + // A MountNamespace is a collection of Mounts. // // MountNamespaces are reference-counted. Unless otherwise specified, all @@ -148,7 +155,7 @@ type MountNamespace struct { func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { - ctx.Warningf("Unknown filesystem: %s", fsTypeName) + ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) @@ -175,26 +182,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, return newMount(vfs, fs, root, nil /* mntns */, opts), nil } -// MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// MountDisconnected creates a Filesystem configured by the given arguments, +// then returns a Mount representing it. The new Mount is not associated with +// any MountNamespace and is not connected to any other Mounts. +func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { - return syserror.ENODEV + return nil, syserror.ENODEV } if !opts.InternalMount && !rft.opts.AllowUserMount { - return syserror.ENODEV + return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { - return err + return nil, err } + defer root.DecRef() + defer fs.DecRef() + return vfs.NewDisconnectedMount(fs, root, opts) +} +// ConnectMountAt connects mnt at the path represented by target. +// +// Preconditions: mnt must be disconnected. +func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error { // We can't hold vfs.mountMu while calling FilesystemImpl methods due to // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { - root.DecRef() - fs.DecRef() return err } vfs.mountMu.Lock() @@ -204,8 +219,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() - root.DecRef() - fs.DecRef() return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -238,7 +251,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia // point and the mount root are directories, or neither are, and returns // ENOTDIR if this is not the case. mntns := vd.mount.ns - mnt := newMount(vfs, fs, root, mntns, opts) vfs.mounts.seq.BeginWrite() vfs.connectLocked(mnt, vd, mntns) vfs.mounts.seq.EndWrite() @@ -247,6 +259,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia return nil } +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { + mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) + if err != nil { + return err + } + defer mnt.DecRef() + if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { + return err + } + return nil +} + // UmountAt removes the Mount at the given path. func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { @@ -369,14 +394,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu // references held by vd. // // Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. d.mu must be locked. mnt.parent() == nil. +// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt +// must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { + if checkInvariants { + if mnt.parent() != nil { + panic("VFS.connectLocked called on connected mount") + } + } + mnt.IncRef() // dropped by callers of umountRecursiveLocked mnt.storeKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } vd.mount.children[mnt] = struct{}{} atomic.AddUint32(&vd.dentry.mounts, 1) + mnt.ns = mntns mntns.mountpoints[vd.dentry]++ vfs.mounts.insertSeqed(mnt) vfsmpmounts, ok := vfs.mountpoints[vd.dentry] @@ -394,6 +427,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns // writer critical section. mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { vd := mnt.loadKey() + if checkInvariants { + if vd.mount != nil { + panic("VFS.disconnectLocked called on disconnected mount") + } + } mnt.storeKey(VirtualDentry{}) delete(vd.mount.children, mnt) atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bc7581698..70f850ca4 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 8d7f8f8af..52643a7c5 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -82,6 +82,10 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // lastMountID is the last allocated mount ID. lastMountID is accessed + // using atomic memory operations. + lastMountID uint64 + // anonMount is a Mount, not included in mounts or mountpoints, // representing an anonFilesystem. anonMount is used to back // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). @@ -418,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(err) { diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 101497ed6..e2894f9f5 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -77,7 +77,10 @@ var DefaultOpts = Opts{ // trigger it. const descheduleThreshold = 1 * time.Second -var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") +var ( + stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout") + stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") +) // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. var stackDumpSameTaskPeriod = time.Minute @@ -220,6 +223,9 @@ func (w *Watchdog) waitForStart() { // We are fine. return } + + stuckStartup.Increment() + var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) w.doAction(w.StartupTimeoutAction, false, &buf) @@ -328,8 +334,8 @@ func (w *Watchdog) reportStuckWatchdog() { } // doAction will take the given action. If the action is LogWarning, the stack -// is not always dumpped to the log to prevent log flooding. "forceStack" -// guarantees that the stack will be dumped regarless. +// is not always dumped to the log to prevent log flooding. "forceStack" +// guarantees that the stack will be dumped regardless. func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) { switch action { case LogWarning: |