summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/control/BUILD5
-rw-r--r--pkg/sentry/control/proc.go127
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/task.go17
-rw-r--r--pkg/sentry/fsbridge/BUILD24
-rw-r--r--pkg/sentry/fsbridge/bridge.go54
-rw-r--r--pkg/sentry/fsbridge/fs.go181
-rw-r--r--pkg/sentry/fsbridge/vfs.go134
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs.go4
-rw-r--r--pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go5
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go5
-rw-r--r--pkg/sentry/fsimpl/ext/directory.go6
-rw-r--r--pkg/sentry/fsimpl/ext/ext_test.go9
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go2
-rw-r--r--pkg/sentry/fsimpl/ext/inode.go12
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go4
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go29
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go3
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go4
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go18
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go20
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go6
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go13
-rw-r--r--pkg/sentry/fsimpl/proc/BUILD1
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go18
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go8
-rw-r--r--pkg/sentry/fsimpl/proc/task.go4
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go16
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_test.go17
-rw-r--r--pkg/sentry/fsimpl/sys/BUILD1
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go7
-rw-r--r--pkg/sentry/fsimpl/sys/sys_test.go7
-rw-r--r--pkg/sentry/fsimpl/testutil/BUILD2
-rw-r--r--pkg/sentry/fsimpl/testutil/kernel.go23
-rw-r--r--pkg/sentry/fsimpl/testutil/testutil.go16
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go10
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go18
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go14
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go5
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file_test.go6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go3
-rw-r--r--pkg/sentry/kernel/BUILD3
-rw-r--r--pkg/sentry/kernel/fs_context.go98
-rw-r--r--pkg/sentry/kernel/kernel.go158
-rw-r--r--pkg/sentry/kernel/rseq.go16
-rw-r--r--pkg/sentry/kernel/task.go27
-rw-r--r--pkg/sentry/kernel/task_clone.go11
-rw-r--r--pkg/sentry/kernel/task_context.go2
-rw-r--r--pkg/sentry/kernel/task_exit.go7
-rw-r--r--pkg/sentry/kernel/task_log.go15
-rw-r--r--pkg/sentry/kernel/task_start.go49
-rw-r--r--pkg/sentry/kernel/thread_group.go6
-rw-r--r--pkg/sentry/loader/BUILD2
-rw-r--r--pkg/sentry/loader/elf.go28
-rw-r--r--pkg/sentry/loader/interpreter.go6
-rw-r--r--pkg/sentry/loader/loader.go179
-rw-r--r--pkg/sentry/loader/vdso.go7
-rw-r--r--pkg/sentry/mm/BUILD2
-rw-r--r--pkg/sentry/mm/metadata.go10
-rw-r--r--pkg/sentry/mm/mm.go4
-rw-r--r--pkg/sentry/platform/ring0/pagetables/BUILD2
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go5
-rw-r--r--pkg/sentry/strace/strace.go28
-rw-r--r--pkg/sentry/syscalls/linux/BUILD3
-rw-r--r--pkg/sentry/syscalls/linux/sys_epoll.go3
-rw-r--r--pkg/sentry/syscalls/linux/sys_prctl.go3
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat.go26
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_amd64.go75
-rw-r--r--pkg/sentry/syscalls/linux/sys_stat_arm64.go77
-rw-r--r--pkg/sentry/syscalls/linux/sys_thread.go17
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go106
-rw-r--r--pkg/sentry/vfs/BUILD1
-rw-r--r--pkg/sentry/vfs/context.go7
-rw-r--r--pkg/sentry/vfs/dentry.go4
-rw-r--r--pkg/sentry/vfs/device.go3
-rw-r--r--pkg/sentry/vfs/epoll.go19
-rw-r--r--pkg/sentry/vfs/file_description.go10
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go10
-rw-r--r--pkg/sentry/vfs/filesystem.go2
-rw-r--r--pkg/sentry/vfs/filesystem_type.go1
-rw-r--r--pkg/sentry/vfs/mount.go14
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go8
-rw-r--r--pkg/sentry/vfs/options.go7
-rw-r--r--pkg/sentry/vfs/permissions.go19
-rw-r--r--pkg/sentry/vfs/vfs.go57
86 files changed, 1374 insertions, 624 deletions
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index e69496477..d16d78aa5 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -16,10 +16,13 @@ go_library(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/fd",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/sentry/fs",
"//pkg/sentry/fs/host",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
@@ -27,8 +30,10 @@ go_library(
"//pkg/sentry/state",
"//pkg/sentry/strace",
"//pkg/sentry/usage",
+ "//pkg/sentry/vfs",
"//pkg/sentry/watchdog",
"//pkg/sync",
+ "//pkg/syserror",
"//pkg/tcpip/link/sniffer",
"//pkg/urpc",
],
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index ced51c66c..5457ba5e7 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -18,19 +18,26 @@ import (
"bytes"
"encoding/json"
"fmt"
+ "path"
"sort"
"strings"
"text/tabwriter"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/urpc"
)
@@ -60,6 +67,12 @@ type ExecArgs struct {
// process's MountNamespace.
MountNamespace *fs.MountNamespace
+ // MountNamespaceVFS2 is the mount namespace to execute the new process in.
+ // A reference on MountNamespace must be held for the lifetime of the
+ // ExecArgs. If MountNamespace is nil, it will default to the init
+ // process's MountNamespace.
+ MountNamespaceVFS2 *vfs.MountNamespace
+
// WorkingDirectory defines the working directory for the new process.
WorkingDirectory string `json:"wd"`
@@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
Envv: args.Envv,
WorkingDirectory: args.WorkingDirectory,
MountNamespace: args.MountNamespace,
+ MountNamespaceVFS2: args.MountNamespaceVFS2,
Credentials: creds,
FDTable: fdTable,
Umask: 0022,
@@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
// be donated to the new process in CreateProcess.
initArgs.MountNamespace.IncRef()
}
+ if initArgs.MountNamespaceVFS2 != nil {
+ // initArgs must hold a reference on MountNamespaceVFS2, which will
+ // be donated to the new process in CreateProcess.
+ initArgs.MountNamespaceVFS2.IncRef()
+ }
ctx := initArgs.NewContext(proc.Kernel)
if initArgs.Filename == "" {
- // Get the full path to the filename from the PATH env variable.
- paths := fs.GetPath(initArgs.Envv)
- mns := initArgs.MountNamespace
- if mns == nil {
- mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
- }
- f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
- if err != nil {
- return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+ if kernel.VFS2Enabled {
+ // Get the full path to the filename from the PATH env variable.
+ if initArgs.MountNamespaceVFS2 == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ //
+ // MountNamespaceVFS2 adds a reference to the namespace, which is
+ // transferred to the new process.
+ initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
+ }
+
+ paths := fs.GetPath(initArgs.Envv)
+ vfsObj := proc.Kernel.VFS()
+ file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+ if err != nil {
+ return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+ }
+ initArgs.File = fsbridge.NewVFSFile(file)
+ } else {
+ // Get the full path to the filename from the PATH env variable.
+ paths := fs.GetPath(initArgs.Envv)
+ if initArgs.MountNamespace == nil {
+ // Set initArgs so that 'ctx' returns the namespace.
+ initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
+
+ // initArgs must hold a reference on MountNamespace, which will
+ // be donated to the new process in CreateProcess.
+ initArgs.MountNamespaceVFS2.IncRef()
+ }
+ f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+ if err != nil {
+ return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
+ }
+ initArgs.Filename = f
}
- initArgs.Filename = f
}
mounter := fs.FileOwnerFromContext(ctx)
+ // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2.
var ttyFile *fs.File
for appFD, hostFile := range args.FilePayload.Files {
var appFile *fs.File
@@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string {
}
return fmt.Sprintf("pts/%d", tty.Index)
}
+
+// ResolveExecutablePath resolves the given executable name given a set of
+// paths that might contain it.
+func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
+ root := vfs.RootFromContext(ctx)
+ defer root.DecRef()
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Absolute paths can be used directly.
+ if path.IsAbs(name) {
+ return openExecutable(ctx, vfsObj, creds, root, name)
+ }
+
+ // Paths with '/' in them should be joined to the working directory, or
+ // to the root if working directory is not set.
+ if strings.IndexByte(name, '/') > 0 {
+ if len(wd) == 0 {
+ wd = "/"
+ }
+ if !path.IsAbs(wd) {
+ return nil, fmt.Errorf("working directory %q must be absolute", wd)
+ }
+ return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name))
+ }
+
+ // Otherwise, we must lookup the name in the paths, starting from the
+ // calling context's root directory.
+ for _, p := range paths {
+ if !path.IsAbs(p) {
+ // Relative paths aren't safe, no one should be using them.
+ log.Warningf("Skipping relative path %q in $PATH", p)
+ continue
+ }
+
+ binPath := path.Join(p, name)
+ f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
+ if err != nil {
+ return nil, err
+ }
+ if f == nil {
+ continue // Not found/no access.
+ }
+ return f, nil
+ }
+ return nil, syserror.ENOENT
+}
+
+func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: root, // binPath is absolute, Start can be anything.
+ Path: fspath.Parse(path),
+ FollowFinalSymlink: true,
+ }
+ opts := &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ FileExec: true,
+ }
+ f, err := vfsObj.OpenAt(ctx, creds, &pop, opts)
+ if err == syserror.ENOENT || err == syserror.EACCES {
+ return nil, nil
+ }
+ return f, err
+}
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 280093c5e..77c2c5c0e 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -36,6 +36,7 @@ go_library(
"//pkg/sentry/fs/proc/device",
"//pkg/sentry/fs/proc/seqfile",
"//pkg/sentry/fs/ramfs",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index ca020e11e..8ab8d8a02 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -28,6 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
"gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile"
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -249,7 +250,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
return newProcInode(t, exeSymlink, msrc, fs.Symlink, t)
}
-func (e *exe) executable() (d *fs.Dirent, err error) {
+func (e *exe) executable() (file fsbridge.File, err error) {
e.t.WithMuLocked(func(t *kernel.Task) {
mm := t.MemoryManager()
if mm == nil {
@@ -262,8 +263,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) {
// The MemoryManager may be destroyed, in which case
// MemoryManager.destroy will simply set the executable to nil
// (with locks held).
- d = mm.Executable()
- if d == nil {
+ file = mm.Executable()
+ if file == nil {
err = syserror.ENOENT
}
})
@@ -283,15 +284,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
}
defer exec.DecRef()
- root := fs.RootFromContext(ctx)
- if root == nil {
- // This doesn't correspond to anything in Linux because the vfs is
- // global there.
- return "", syserror.EINVAL
- }
- defer root.DecRef()
- n, _ := exec.FullName(root)
- return n, nil
+ return exec.PathnameWithDeleted(ctx), nil
}
// namespaceSymlink represents a symlink in the namespacefs, such as the files
diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD
new file mode 100644
index 000000000..6c798f0bd
--- /dev/null
+++ b/pkg/sentry/fsbridge/BUILD
@@ -0,0 +1,24 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+ name = "fsbridge",
+ srcs = [
+ "bridge.go",
+ "fs.go",
+ "vfs.go",
+ ],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fspath",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/usermem",
+ ],
+)
diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go
new file mode 100644
index 000000000..8e7590721
--- /dev/null
+++ b/pkg/sentry/fsbridge/bridge.go
@@ -0,0 +1,54 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2
+// files.
+package fsbridge
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// File provides a common interface to bridge between VFS1 and VFS2 files.
+type File interface {
+ // PathnameWithDeleted returns an absolute pathname to vd, consistent with
+ // Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+ // PathnameWithDeleted appends " (deleted)" to the returned pathname.
+ PathnameWithDeleted(ctx context.Context) string
+
+ // ReadFull read all contents from the file.
+ ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
+
+ // ConfigureMMap mutates opts to implement mmap(2) for the file.
+ ConfigureMMap(context.Context, *memmap.MMapOpts) error
+
+ // Type returns the file type, e.g. linux.S_IFREG.
+ Type(context.Context) (linux.FileMode, error)
+
+ // IncRef increments reference.
+ IncRef()
+
+ // DecRef decrements reference.
+ DecRef()
+}
+
+// Lookup provides a common interface to open files.
+type Lookup interface {
+ // OpenPath opens a file.
+ OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error)
+}
diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go
new file mode 100644
index 000000000..093ce1fb3
--- /dev/null
+++ b/pkg/sentry/fsbridge/fs.go
@@ -0,0 +1,181 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+ "io"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over fs.File.
+//
+// +stateify savable
+type fsFile struct {
+ file *fs.File
+}
+
+var _ File = (*fsFile)(nil)
+
+// NewFSFile creates a new File over fs.File.
+func NewFSFile(file *fs.File) File {
+ return &fsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *fsFile) PathnameWithDeleted(ctx context.Context) string {
+ root := fs.RootFromContext(ctx)
+ if root == nil {
+ // This doesn't correspond to anything in Linux because the vfs is
+ // global there.
+ return ""
+ }
+ defer root.DecRef()
+
+ name, _ := f.file.Dirent.FullName(root)
+ return name
+}
+
+// ReadFull implements File.
+func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+ var total int64
+ for dst.NumBytes() > 0 {
+ n, err := f.file.Preadv(ctx, dst, offset+total)
+ total += n
+ if err == io.EOF && total != 0 {
+ return total, io.ErrUnexpectedEOF
+ } else if err != nil {
+ return total, err
+ }
+ dst = dst.DropFirst64(n)
+ }
+ return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *fsFile) Type(context.Context) (linux.FileMode, error) {
+ return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil
+}
+
+// IncRef implements File.
+func (f *fsFile) IncRef() {
+ f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *fsFile) DecRef() {
+ f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type fsLookup struct {
+ mntns *fs.MountNamespace
+
+ root *fs.Dirent
+ workingDir *fs.Dirent
+}
+
+var _ Lookup = (*fsLookup)(nil)
+
+// NewFSLookup creates a new Lookup using VFS1.
+func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup {
+ return &fsLookup{
+ mntns: mntns,
+ root: root,
+ workingDir: workingDir,
+ }
+}
+
+// OpenPath implements Lookup.
+func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) {
+ var d *fs.Dirent
+ var err error
+ if resolveFinal {
+ d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals)
+ } else {
+ d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals)
+ }
+ if err != nil {
+ return nil, err
+ }
+ defer d.DecRef()
+
+ if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
+ return nil, syserror.ELOOP
+ }
+
+ fsPerm := openOptionsToPermMask(&opts)
+ if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil {
+ return nil, err
+ }
+
+ // If they claim it's a directory, then make sure.
+ if strings.HasSuffix(path, "/") {
+ if d.Inode.StableAttr.Type != fs.Directory {
+ return nil, syserror.ENOTDIR
+ }
+ }
+
+ if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile {
+ ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type)
+ return nil, syserror.EACCES
+ }
+
+ f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags))
+ if err != nil {
+ return nil, err
+ }
+
+ return &fsFile{file: f}, nil
+}
+
+func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask {
+ mode := opts.Flags & linux.O_ACCMODE
+ return fs.PermMask{
+ Read: mode == linux.O_RDONLY || mode == linux.O_RDWR,
+ Write: mode == linux.O_WRONLY || mode == linux.O_RDWR,
+ Execute: opts.FileExec,
+ }
+}
+
+func flagsToFileFlags(flags uint32) fs.FileFlags {
+ return fs.FileFlags{
+ Direct: flags&linux.O_DIRECT != 0,
+ DSync: flags&(linux.O_DSYNC|linux.O_SYNC) != 0,
+ Sync: flags&linux.O_SYNC != 0,
+ NonBlocking: flags&linux.O_NONBLOCK != 0,
+ Read: (flags & linux.O_ACCMODE) != linux.O_WRONLY,
+ Write: (flags & linux.O_ACCMODE) != linux.O_RDONLY,
+ Append: flags&linux.O_APPEND != 0,
+ Directory: flags&linux.O_DIRECTORY != 0,
+ Async: flags&linux.O_ASYNC != 0,
+ LargeFile: flags&linux.O_LARGEFILE != 0,
+ Truncate: flags&linux.O_TRUNC != 0,
+ }
+}
diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go
new file mode 100644
index 000000000..e657c39bc
--- /dev/null
+++ b/pkg/sentry/fsbridge/vfs.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsbridge
+
+import (
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fsFile implements File interface over vfs.FileDescription.
+//
+// +stateify savable
+type vfsFile struct {
+ file *vfs.FileDescription
+}
+
+var _ File = (*vfsFile)(nil)
+
+// NewVFSFile creates a new File over fs.File.
+func NewVFSFile(file *vfs.FileDescription) File {
+ return &vfsFile{file: file}
+}
+
+// PathnameWithDeleted implements File.
+func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string {
+ root := vfs.RootFromContext(ctx)
+ defer root.DecRef()
+
+ vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem()
+ name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry())
+ return name
+}
+
+// ReadFull implements File.
+func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+ var total int64
+ for dst.NumBytes() > 0 {
+ n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{})
+ total += n
+ if err == io.EOF && total != 0 {
+ return total, io.ErrUnexpectedEOF
+ } else if err != nil {
+ return total, err
+ }
+ dst = dst.DropFirst64(n)
+ }
+ return total, nil
+}
+
+// ConfigureMMap implements File.
+func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return f.file.ConfigureMMap(ctx, opts)
+}
+
+// Type implements File.
+func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) {
+ stat, err := f.file.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ return 0, err
+ }
+ return linux.FileMode(stat.Mode).FileType(), nil
+}
+
+// IncRef implements File.
+func (f *vfsFile) IncRef() {
+ f.file.IncRef()
+}
+
+// DecRef implements File.
+func (f *vfsFile) DecRef() {
+ f.file.DecRef()
+}
+
+// fsLookup implements Lookup interface using fs.File.
+//
+// +stateify savable
+type vfsLookup struct {
+ mntns *vfs.MountNamespace
+
+ root vfs.VirtualDentry
+ workingDir vfs.VirtualDentry
+}
+
+var _ Lookup = (*vfsLookup)(nil)
+
+// NewVFSLookup creates a new Lookup using VFS2.
+func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup {
+ return &vfsLookup{
+ mntns: mntns,
+ root: root,
+ workingDir: workingDir,
+ }
+}
+
+// OpenPath implements Lookup.
+//
+// remainingTraversals is not configurable in VFS2, all callers are using the
+// default anyways.
+//
+// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
+func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
+ vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
+ creds := auth.CredentialsFromContext(ctx)
+ pop := &vfs.PathOperation{
+ Root: l.root,
+ Start: l.root,
+ Path: fspath.Parse(path),
+ FollowFinalSymlink: resolveFinal,
+ }
+ fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
+ if err != nil {
+ return nil, err
+ }
+ return &vfsFile{file: fd}, nil
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index e03a0c665..abd4f24e7 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -28,6 +28,9 @@ import (
"gvisor.dev/gvisor/pkg/sync"
)
+// Name is the default filesystem name.
+const Name = "devtmpfs"
+
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct {
initOnce sync.Once
@@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
func (ac *accessorContext) Value(key interface{}) interface{} {
switch key {
case vfs.CtxMountNamespace:
+ ac.a.mntns.IncRef()
return ac.a.mntns
case vfs.CtxRoot:
ac.a.root.IncRef()
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 73308a2b5..b6d52c015 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) {
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
- vfsObj := vfs.New()
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
// Register tmpfs just so that we can have a root filesystem that isn't
// devtmpfs.
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2015a8871..89caee3df 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
creds := auth.CredentialsFromContext(ctx)
// Create VFS.
- vfsObj := vfs.New()
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ return nil, nil, nil, nil, err
+ }
vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index ebb72b75e..bd6ede995 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -188,14 +188,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
}
- if !cb.Handle(vfs.Dirent{
+ if err := cb.Handle(vfs.Dirent{
Name: child.diskDirent.FileName(),
Type: fs.ToDirentType(childType),
Ino: uint64(child.diskDirent.Inode()),
NextOff: fd.off + 1,
- }) {
+ }); err != nil {
dir.childList.InsertBefore(child, fd.iter)
- return nil
+ return err
}
fd.off++
}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 05f992826..29bb73765 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
creds := auth.CredentialsFromContext(ctx)
// Create VFS.
- vfsObj := vfs.New()
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
@@ -496,9 +499,9 @@ func newIterDirentCb() *iterDirentsCb {
}
// Handle implements vfs.IterDirentsCallback.Handle.
-func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error {
cb.dirents = append(cb.dirents, dirent)
- return true
+ return nil
}
// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 07bf58953..e05429d41 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
return nil, syserror.EROFS
}
- return inode.open(rp, vfsd, opts.Flags)
+ return inode.open(rp, vfsd, &opts)
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 191b39970..6962083f5 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
}
// open creates and returns a file description for the dentry passed in.
-func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
- ats := vfs.AccessTypesForOpenFlags(flags)
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
@@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
switch in.impl.(type) {
case *regularFile:
var fd regularFileFD
- if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
return nil, syserror.EISDIR
}
var fd directoryFD
- if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
case *symlink:
- if flags&linux.O_PATH == 0 {
+ if opts.Flags&linux.O_PATH == 0 {
// Can't open symlinks without O_PATH.
return nil, syserror.ELOOP
}
var fd symlinkFD
- fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+ fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
default:
panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 6d4ebc2bf..5dbfc6250 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -65,8 +65,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
}
for fd.off < int64(len(fd.dirents)) {
- if !cb.Handle(fd.dirents[fd.off]) {
- return nil
+ if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+ return err
}
fd.off++
}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 8eb61debf..5cfb0dc4c 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -400,6 +400,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
childVFSD := parent.vfsd.Child(name)
@@ -593,7 +594,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
}
if rp.Done() {
- return start.openLocked(ctx, rp, opts.Flags)
+ return start.openLocked(ctx, rp, &opts)
}
afterTrailingSymlink:
@@ -633,12 +634,12 @@ afterTrailingSymlink:
start = parent
goto afterTrailingSymlink
}
- return child.openLocked(ctx, rp, opts.Flags)
+ return child.openLocked(ctx, rp, &opts)
}
// Preconditions: fs.renameMu must be locked.
-func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) {
- ats := vfs.AccessTypesForOpenFlags(flags)
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(opts)
if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil {
return nil, err
}
@@ -646,11 +647,11 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
filetype := d.fileType()
switch {
case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD:
- if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil {
return nil, err
}
fd := &regularFileFD{}
- if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
AllowDirectIO: true,
}); err != nil {
return nil, err
@@ -658,21 +659,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
return &fd.vfsfd, nil
case filetype == linux.S_IFDIR:
// Can't open directories with O_CREAT.
- if flags&linux.O_CREAT != 0 {
+ if opts.Flags&linux.O_CREAT != 0 {
return nil, syserror.EISDIR
}
// Can't open directories writably.
if ats&vfs.MayWrite != 0 {
return nil, syserror.EISDIR
}
- if flags&linux.O_DIRECT != 0 {
+ if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
return nil, err
}
fd := &directoryFD{}
- if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
return nil, err
}
return &fd.vfsfd, nil
@@ -680,17 +681,17 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui
// Can't open symlinks without O_PATH (which is unimplemented).
return nil, syserror.ELOOP
default:
- if flags&linux.O_DIRECT != 0 {
+ if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
- h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0)
+ h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0)
if err != nil {
return nil, err
}
fd := &specialFileFD{
handle: h,
}
- if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
h.close(ctx)
return nil, err
}
@@ -934,7 +935,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if oldParent == newParent && oldName == newName {
return nil
}
- if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil {
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
return err
}
if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index d0552bd99..d00850e25 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -52,6 +52,9 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// Name is the default filesystem name.
+const Name = "9p"
+
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 733792c78..d092ccb2a 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy
}
// Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &DynamicBytesFD{}
- if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil {
+ if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil {
return nil, err
}
return &fd.vfsfd, nil
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 6104751c8..5650512e0 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -43,12 +43,12 @@ type GenericDirectoryFD struct {
}
// Init initializes a GenericDirectoryFD.
-func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error {
- if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 {
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error {
+ if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
// Can't open directories for writing.
return syserror.EISDIR
}
- if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
return err
}
fd.children = children
@@ -116,8 +116,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
Ino: stat.Ino,
NextOff: 1,
}
- if !cb.Handle(dirent) {
- return nil
+ if err := cb.Handle(dirent); err != nil {
+ return err
}
fd.off++
}
@@ -132,8 +132,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
Ino: stat.Ino,
NextOff: 2,
}
- if !cb.Handle(dirent) {
- return nil
+ if err := cb.Handle(dirent); err != nil {
+ return err
}
fd.off++
}
@@ -153,8 +153,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
Ino: stat.Ino,
NextOff: fd.off + 1,
}
- if !cb.Handle(dirent) {
- return nil
+ if err := cb.Handle(dirent); err != nil {
+ return err
}
fd.off++
}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index e49303c26..292f58afd 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
// appropriate bits in rp), but are returned by
// FileDescriptionImpl.StatusFlags().
opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
- ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+ ats := vfs.AccessTypesForOpenFlags(&opts)
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
@@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- return inode.Open(rp, vfsd, opts.Flags)
+ return inode.Open(rp, vfsd, opts)
}
// May create new file.
@@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- return inode.Open(rp, vfsd, opts.Flags)
+ return inode.Open(rp, vfsd, opts)
}
afterTrailingSymlink:
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
@@ -438,7 +438,7 @@ afterTrailingSymlink:
return nil, err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
- return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+ return child.Impl().(*Dentry).inode.Open(rp, child, opts)
}
// Open existing file or follow symlink.
if mustCreate {
@@ -463,7 +463,7 @@ afterTrailingSymlink:
if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
return nil, err
}
- return childInode.Open(rp, childVFSD, opts.Flags)
+ return childInode.Open(rp, childVFSD, opts)
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
virtfs := rp.VirtualFilesystem()
srcDirDentry := srcDirVFSD.Impl().(*Dentry)
@@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
parentDentry := vfsd.Parent().Impl().(*Dentry)
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
- if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
@@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
parentDentry := vfsd.Parent().Impl().(*Dentry)
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
- if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
return err
}
if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index adca2313f..099d70a16 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -507,7 +507,7 @@ type InodeSymlink struct {
}
// Open implements Inode.Open.
-func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
return nil, syserror.ELOOP
}
@@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F
}
// Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 79ebea8a5..c74fa999b 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -303,7 +303,7 @@ type Inode interface {
// inode for its lifetime.
//
// Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
- Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+ Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
}
type inodeRefs interface {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index ee65cf491..0459fb305 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
ctx := contexttest.Context(t)
creds := auth.CredentialsFromContext(ctx)
- v := vfs.New()
+ v := &vfs.VirtualFilesystem{}
+ if err := v.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
@@ -113,9 +116,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod
return &dir.dentry
}
-func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil {
+ if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil {
return nil, err
}
return fd.VFSFileDescription(), nil
@@ -143,9 +146,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte
return &dir.dentry
}
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 12aac2e6a..a83245866 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -14,6 +14,7 @@ go_library(
"tasks_net.go",
"tasks_sys.go",
],
+ visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 11477b6a9..5c19d5522 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -26,15 +26,18 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
-// procFSType is the factory class for procfs.
+// Name is the default filesystem name.
+const Name = "proc"
+
+// FilesystemType is the factory class for procfs.
//
// +stateify savable
-type procFSType struct{}
+type FilesystemType struct{}
-var _ vfs.FilesystemType = (*procFSType)(nil)
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
// GetFilesystem implements vfs.FilesystemType.
-func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
k := kernel.KernelFromContext(ctx)
if k == nil {
return nil, nil, fmt.Errorf("procfs requires a kernel")
@@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile
procfs := &kernfs.Filesystem{}
procfs.VFSFilesystem().Init(vfsObj, procfs)
- var data *InternalData
+ var cgroups map[string]string
if opts.InternalData != nil {
- data = opts.InternalData.(*InternalData)
+ data := opts.InternalData.(*InternalData)
+ cgroups = data.Cgroups
}
- _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups)
+ _, dentry := newTasksInode(procfs, k, pidns, cgroups)
return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 353e37195..f3f4e49b4 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -105,8 +105,8 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
Ino: i.inoGen.NextIno(),
NextOff: offset + 1,
}
- if !cb.Handle(dirent) {
- return offset, nil
+ if err := cb.Handle(dirent); err != nil {
+ return offset, err
}
offset++
}
@@ -114,9 +114,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
}
// Open implements kernfs.Inode.
-func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index eb5bc62c0..2d814668a 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -98,9 +98,9 @@ func (i *taskInode) Valid(ctx context.Context) bool {
}
// Open implements kernfs.Inode.
-func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 14bd334e8..ce08a7d53 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -151,8 +151,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
Ino: i.inoGen.NextIno(),
NextOff: offset + 1,
}
- if !cb.Handle(dirent) {
- return offset, nil
+ if err := cb.Handle(dirent); err != nil {
+ return offset, err
}
offset++
}
@@ -163,8 +163,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
Ino: i.inoGen.NextIno(),
NextOff: offset + 1,
}
- if !cb.Handle(dirent) {
- return offset, nil
+ if err := cb.Handle(dirent); err != nil {
+ return offset, err
}
offset++
}
@@ -196,8 +196,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
Ino: i.inoGen.NextIno(),
NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
}
- if !cb.Handle(dirent) {
- return offset, nil
+ if err := cb.Handle(dirent); err != nil {
+ return offset, err
}
offset++
}
@@ -205,9 +205,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
}
// Open implements kernfs.Inode.
-func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 6fc3524db..c5d531fe0 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -90,8 +90,7 @@ func setup(t *testing.T) *testutil.System {
ctx := k.SupervisorContext()
creds := auth.CredentialsFromContext(ctx)
- vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+ k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
fsOpts := vfs.GetFilesystemOptions{
@@ -102,11 +101,11 @@ func setup(t *testing.T) *testutil.System {
},
},
}
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts)
+ mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts)
if err != nil {
t.Fatalf("NewMountNamespace(): %v", err)
}
- return testutil.NewSystem(ctx, t, vfsObj, mntns)
+ return testutil.NewSystem(ctx, t, k.VFS(), mntns)
}
func TestTasksEmpty(t *testing.T) {
@@ -131,7 +130,7 @@ func TestTasks(t *testing.T) {
var tasks []*kernel.Task
for i := 0; i < 5; i++ {
tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
- task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+ task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
if err != nil {
t.Fatalf("CreateTask(): %v", err)
}
@@ -213,7 +212,7 @@ func TestTasksOffset(t *testing.T) {
k := kernel.KernelFromContext(s.Ctx)
for i := 0; i < 3; i++ {
tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
- if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+ if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil {
t.Fatalf("CreateTask(): %v", err)
}
}
@@ -337,7 +336,7 @@ func TestTask(t *testing.T) {
k := kernel.KernelFromContext(s.Ctx)
tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
- _, err := testutil.CreateTask(s.Ctx, "name", tc)
+ _, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
if err != nil {
t.Fatalf("CreateTask(): %v", err)
}
@@ -352,7 +351,7 @@ func TestProcSelf(t *testing.T) {
k := kernel.KernelFromContext(s.Ctx)
tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
- task, err := testutil.CreateTask(s.Ctx, "name", tc)
+ task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
if err != nil {
t.Fatalf("CreateTask(): %v", err)
}
@@ -433,7 +432,7 @@ func TestTree(t *testing.T) {
var tasks []*kernel.Task
for i := 0; i < 5; i++ {
tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
- task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc)
+ task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
if err != nil {
t.Fatalf("CreateTask(): %v", err)
}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 66c0d8bc8..a741e2bb6 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -7,6 +7,7 @@ go_library(
srcs = [
"sys.go",
],
+ visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
"//pkg/context",
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index e35d52d17..c36c4fa11 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -28,6 +28,9 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// Name is the default filesystem name.
+const Name = "sysfs"
+
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
@@ -97,9 +100,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
}
// Open implements kernfs.Inode.Open.
-func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &kernfs.GenericDirectoryFD{}
- fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+ fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts)
return fd.VFSFileDescription(), nil
}
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 8b1cf0bd0..4b3602d47 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System {
}
ctx := k.SupervisorContext()
creds := auth.CredentialsFromContext(ctx)
- v := vfs.New()
- v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
- mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{})
+ mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
if err != nil {
t.Fatalf("Failed to create new mount namespace: %v", err)
}
- return testutil.NewSystem(ctx, t, v, mns)
+ return testutil.NewSystem(ctx, t, k.VFS(), mns)
}
func TestReadCPUFile(t *testing.T) {
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
index efd5974c4..e4f36f4ae 100644
--- a/pkg/sentry/fsimpl/testutil/BUILD
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -16,7 +16,7 @@ go_library(
"//pkg/cpuid",
"//pkg/fspath",
"//pkg/memutil",
- "//pkg/sentry/fs",
+ "//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/sched",
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 89f8c4915..d0be32e72 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -24,7 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/memutil"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
@@ -33,6 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
// Platforms are plugable.
_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
@@ -99,26 +100,28 @@ func Boot() (*kernel.Kernel, error) {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
- ctx := k.SupervisorContext()
+ kernel.VFS2Enabled = true
- // Create mount namespace without root as it's the minimum required to create
- // the global thread group.
- mntns, err := fs.NewMountNamespace(ctx, nil)
- if err != nil {
- return nil, err
+ if err := k.VFS().Init(); err != nil {
+ return nil, fmt.Errorf("VFS init: %v", err)
}
+ k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
+
ls, err := limits.NewLinuxLimitSet()
if err != nil {
return nil, err
}
- tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+ tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
k.TestOnly_SetGlobalInit(tg)
return k, nil
}
// CreateTask creates a new bare bones task for tests.
-func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
k := kernel.KernelFromContext(ctx)
config := &kernel.TaskConfig{
Kernel: k,
@@ -129,6 +132,8 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kern
UTSNamespace: kernel.UTSNamespaceFromContext(ctx),
IPCNamespace: kernel.IPCNamespaceFromContext(ctx),
AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+ MountNamespaceVFS2: mntns,
+ FSContext: kernel.NewFSContextVFS2(root, cwd, 0022),
}
return k.TaskSet().NewTask(config)
}
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 69fd84ddd..e16808c63 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -41,12 +41,12 @@ type System struct {
Creds *auth.Credentials
VFS *vfs.VirtualFilesystem
Root vfs.VirtualDentry
- mns *vfs.MountNamespace
+ MntNs *vfs.MountNamespace
}
// NewSystem constructs a System.
//
-// Precondition: Caller must hold a reference on mns, whose ownership
+// Precondition: Caller must hold a reference on MntNs, whose ownership
// is transferred to the new System.
func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
s := &System{
@@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns
Ctx: ctx,
Creds: auth.CredentialsFromContext(ctx),
VFS: v,
- mns: mns,
+ MntNs: mns,
Root: mns.Root(),
}
return s
@@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System {
Ctx: s.Ctx,
Creds: s.Creds,
VFS: s.VFS,
- mns: s.mns,
+ MntNs: s.MntNs,
Root: s.Root,
}
}
@@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
Ctx: ctx,
Creds: s.Creds,
VFS: s.VFS,
- mns: s.mns,
+ MntNs: s.MntNs,
Root: s.Root,
}
}
@@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System {
// Destroy release resources associated with a test system.
func (s *System) Destroy() {
s.Root.DecRef()
- s.mns.DecRef() // Reference on mns passed to NewSystem.
+ s.MntNs.DecRef() // Reference on MntNs passed to NewSystem.
}
// ReadToEnd reads the contents of fd until EOF to a string.
@@ -226,7 +226,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) {
}
// Handle implements vfs.IterDirentsCallback.Handle.
-func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
+func (d *DirentCollector) Handle(dirent vfs.Dirent) error {
d.mu.Lock()
if d.dirents == nil {
d.dirents = make(map[string]*vfs.Dirent)
@@ -234,7 +234,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool {
d.order = append(d.order, &dirent)
d.dirents[dirent.Name] = &dirent
d.mu.Unlock()
- return true
+ return nil
}
// Count returns the number of dirents currently in the collector.
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 9fce5e4b4..383133e44 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
creds := auth.CredentialsFromContext(ctx)
// Create VFS.
- vfsObj := vfs.New()
+ vfsObj := vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ b.Fatalf("VFS init: %v", err)
+ }
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
@@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
creds := auth.CredentialsFromContext(ctx)
// Create VFS.
- vfsObj := vfs.New()
+ vfsObj := vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ b.Fatalf("VFS init: %v", err)
+ }
vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index dc0d27cf9..b4380af38 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -74,25 +74,25 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
defer fs.mu.Unlock()
if fd.off == 0 {
- if !cb.Handle(vfs.Dirent{
+ if err := cb.Handle(vfs.Dirent{
Name: ".",
Type: linux.DT_DIR,
Ino: vfsd.Impl().(*dentry).inode.ino,
NextOff: 1,
- }) {
- return nil
+ }); err != nil {
+ return err
}
fd.off++
}
if fd.off == 1 {
parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
- if !cb.Handle(vfs.Dirent{
+ if err := cb.Handle(vfs.Dirent{
Name: "..",
Type: parentInode.direntType(),
Ino: parentInode.ino,
NextOff: 2,
- }) {
- return nil
+ }); err != nil {
+ return err
}
fd.off++
}
@@ -111,14 +111,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
for child != nil {
// Skip other directoryFD iterators.
if child.inode != nil {
- if !cb.Handle(vfs.Dirent{
+ if err := cb.Handle(vfs.Dirent{
Name: child.vfsd.Name(),
Type: child.inode.direntType(),
Ino: child.inode.ino,
NextOff: fd.off + 1,
- }) {
+ }); err != nil {
dir.childList.InsertBefore(child, fd.iter)
- return nil
+ return err
}
fd.off++
}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 72bc15264..7f7b791c4 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -334,7 +334,7 @@ afterTrailingSymlink:
}
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
- ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+ ats := vfs.AccessTypesForOpenFlags(opts)
if !afterCreate {
if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
return nil, err
@@ -486,7 +486,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
vfsObj := rp.VirtualFilesystem()
oldParentDir := oldParent.inode.impl.(*directory)
newParentDir := newParent.inode.impl.(*directory)
- if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil {
return err
}
if replaced != nil {
@@ -543,7 +545,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
- if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
return err
}
parent.inode.impl.(*directory).childList.Remove(child)
@@ -631,7 +635,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
defer mnt.EndWrite()
vfsObj := rp.VirtualFilesystem()
- if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef()
+ if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
return err
}
parent.inode.impl.(*directory).childList.Remove(child)
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 5ee7f2a72..1614f2c39 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
creds := auth.CredentialsFromContext(ctx)
// Create VFS.
- vfsObj := vfs.New()
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index e9f71e334..0399725cf 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -40,7 +40,11 @@ var nextFileID int64
func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
creds := auth.CredentialsFromContext(ctx)
- vfsObj := vfs.New()
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+ }
+
vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
AllowUserMount: true,
})
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 2108d0f4d..c5bb17562 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -40,6 +40,9 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
+// Name is the default filesystem name.
+const Name = "tmpfs"
+
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2231d6973..beba29a09 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -157,6 +157,7 @@ go_library(
"//pkg/context",
"//pkg/cpuid",
"//pkg/eventchannel",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/metric",
"//pkg/refs",
@@ -167,6 +168,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/hostcpu",
"//pkg/sentry/inet",
"//pkg/sentry/kernel/auth",
@@ -199,6 +201,7 @@ go_library(
"//pkg/tcpip/stack",
"//pkg/usermem",
"//pkg/waiter",
+ "//tools/go_marshal/marshal",
],
)
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 2448c1d99..7218aa24e 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -19,6 +19,7 @@ import (
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -37,10 +38,16 @@ type FSContext struct {
// destroyed.
root *fs.Dirent
+ // rootVFS2 is the filesystem root.
+ rootVFS2 vfs.VirtualDentry
+
// cwd is the current working directory. Will be nil iff the FSContext
// has been destroyed.
cwd *fs.Dirent
+ // cwdVFS2 is the current working directory.
+ cwdVFS2 vfs.VirtualDentry
+
// umask is the current file mode creation mask. When a thread using this
// context invokes a syscall that creates a file, bits set in umask are
// removed from the permissions that the file is created with.
@@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
return &f
}
+// NewFSContextVFS2 returns a new filesystem context.
+func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
+ root.IncRef()
+ cwd.IncRef()
+ f := FSContext{
+ rootVFS2: root,
+ cwdVFS2: cwd,
+ umask: umask,
+ }
+ f.EnableLeakCheck("kernel.FSContext")
+ return &f
+}
+
// destroy is the destructor for an FSContext.
//
// This will call DecRef on both root and cwd Dirents. If either call to
@@ -75,11 +95,17 @@ func (f *FSContext) destroy() {
f.mu.Lock()
defer f.mu.Unlock()
- f.root.DecRef()
- f.root = nil
-
- f.cwd.DecRef()
- f.cwd = nil
+ if VFS2Enabled {
+ f.rootVFS2.DecRef()
+ f.rootVFS2 = vfs.VirtualDentry{}
+ f.cwdVFS2.DecRef()
+ f.cwdVFS2 = vfs.VirtualDentry{}
+ } else {
+ f.root.DecRef()
+ f.root = nil
+ f.cwd.DecRef()
+ f.cwd = nil
+ }
}
// DecRef implements RefCounter.DecRef with destructor f.destroy.
@@ -93,12 +119,21 @@ func (f *FSContext) DecRef() {
func (f *FSContext) Fork() *FSContext {
f.mu.Lock()
defer f.mu.Unlock()
- f.cwd.IncRef()
- f.root.IncRef()
+
+ if VFS2Enabled {
+ f.cwdVFS2.IncRef()
+ f.rootVFS2.IncRef()
+ } else {
+ f.cwd.IncRef()
+ f.root.IncRef()
+ }
+
return &FSContext{
- cwd: f.cwd,
- root: f.root,
- umask: f.umask,
+ cwd: f.cwd,
+ root: f.root,
+ cwdVFS2: f.cwdVFS2,
+ rootVFS2: f.rootVFS2,
+ umask: f.umask,
}
}
@@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext {
func (f *FSContext) WorkingDirectory() *fs.Dirent {
f.mu.Lock()
defer f.mu.Unlock()
- if f.cwd != nil {
- f.cwd.IncRef()
- }
+
+ f.cwd.IncRef()
return f.cwd
}
+// WorkingDirectoryVFS2 returns the current working directory.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ f.cwdVFS2.IncRef()
+ return f.cwdVFS2
+}
+
// SetWorkingDirectory sets the current working directory.
// This will take an extra reference on the Dirent.
//
@@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) {
old.DecRef()
}
+// SetWorkingDirectoryVFS2 sets the current working directory.
+// This will take an extra reference on the VirtualDentry.
+//
+// This is not a valid call after destroy.
+func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ old := f.cwdVFS2
+ f.cwdVFS2 = d
+ d.IncRef()
+ old.DecRef()
+}
+
// RootDirectory returns the current filesystem root.
//
// This will return nil if called after destroy(), otherwise it will return a
@@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
return f.root
}
+// RootDirectoryVFS2 returns the current filesystem root.
+//
+// This will return nil if called after destroy(), otherwise it will return a
+// Dirent with a reference taken.
+func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ f.rootVFS2.IncRef()
+ return f.rootVFS2
+}
+
// SetRootDirectory sets the root directory.
// This will take an extra reference on the Dirent.
//
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3ee760ba2..7da0368f1 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -43,11 +43,13 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/eventchannel"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -71,6 +73,10 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
)
+// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow
+// easy access everywhere. To be removed once VFS2 becomes the default.
+var VFS2Enabled = false
+
// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
@@ -238,6 +244,9 @@ type Kernel struct {
// SpecialOpts contains special kernel options.
SpecialOpts
+
+ // VFS keeps the filesystem state used across the kernel.
+ vfs vfs.VirtualFilesystem
}
// InitKernelArgs holds arguments to Init.
@@ -624,7 +633,7 @@ type CreateProcessArgs struct {
// File is a passed host FD pointing to a file to load as the init binary.
//
// This is checked if and only if Filename is "".
- File *fs.File
+ File fsbridge.File
// Argvv is a list of arguments.
Argv []string
@@ -673,6 +682,13 @@ type CreateProcessArgs struct {
// increment it).
MountNamespace *fs.MountNamespace
+ // MountNamespaceVFS2 optionally contains the mount namespace for this
+ // process. If nil, the init process's mount namespace is used.
+ //
+ // Anyone setting MountNamespaceVFS2 must donate a reference (i.e.
+ // increment it).
+ MountNamespaceVFS2 *vfs.MountNamespace
+
// ContainerID is the container that the process belongs to.
ContainerID string
}
@@ -711,11 +727,22 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
return ctx.args.Credentials
case fs.CtxRoot:
if ctx.args.MountNamespace != nil {
- // MountNamespace.Root() will take a reference on the root
- // dirent for us.
+ // MountNamespace.Root() will take a reference on the root dirent for us.
return ctx.args.MountNamespace.Root()
}
return nil
+ case vfs.CtxRoot:
+ if ctx.args.MountNamespaceVFS2 == nil {
+ return nil
+ }
+ // MountNamespaceVFS2.Root() takes a reference on the root dirent for us.
+ return ctx.args.MountNamespaceVFS2.Root()
+ case vfs.CtxMountNamespace:
+ if ctx.k.globalInit == nil {
+ return nil
+ }
+ // MountNamespaceVFS2 takes a reference for us.
+ return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
@@ -757,34 +784,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
defer k.extMu.Unlock()
log.Infof("EXEC: %v", args.Argv)
- // Grab the mount namespace.
- mounts := args.MountNamespace
- if mounts == nil {
- mounts = k.GlobalInit().Leader().MountNamespace()
- mounts.IncRef()
- }
-
- tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
ctx := args.NewContext(k)
- // Get the root directory from the MountNamespace.
- root := mounts.Root()
- // The call to newFSContext below will take a reference on root, so we
- // don't need to hold this one.
- defer root.DecRef()
-
- // Grab the working directory.
- remainingTraversals := uint(args.MaxSymlinkTraversals)
- wd := root // Default.
- if args.WorkingDirectory != "" {
- var err error
- wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
- if err != nil {
- return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ var (
+ opener fsbridge.Lookup
+ fsContext *FSContext
+ mntns *fs.MountNamespace
+ )
+
+ if VFS2Enabled {
+ mntnsVFS2 := args.MountNamespaceVFS2
+ if mntnsVFS2 == nil {
+ // MountNamespaceVFS2 adds a reference to the namespace, which is
+ // transferred to the new process.
+ mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2()
+ }
+ // Get the root directory from the MountNamespace.
+ root := args.MountNamespaceVFS2.Root()
+ // The call to newFSContext below will take a reference on root, so we
+ // don't need to hold this one.
+ defer root.DecRef()
+
+ // Grab the working directory.
+ wd := root // Default.
+ if args.WorkingDirectory != "" {
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: wd,
+ Path: fspath.Parse(args.WorkingDirectory),
+ FollowFinalSymlink: true,
+ }
+ var err error
+ wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{
+ CheckSearchable: true,
+ })
+ if err != nil {
+ return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ }
+ defer wd.DecRef()
+ }
+ opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd)
+ fsContext = NewFSContextVFS2(root, wd, args.Umask)
+
+ } else {
+ mntns = args.MountNamespace
+ if mntns == nil {
+ mntns = k.GlobalInit().Leader().MountNamespace()
+ mntns.IncRef()
+ }
+ // Get the root directory from the MountNamespace.
+ root := mntns.Root()
+ // The call to newFSContext below will take a reference on root, so we
+ // don't need to hold this one.
+ defer root.DecRef()
+
+ // Grab the working directory.
+ remainingTraversals := args.MaxSymlinkTraversals
+ wd := root // Default.
+ if args.WorkingDirectory != "" {
+ var err error
+ wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+ if err != nil {
+ return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
+ }
+ defer wd.DecRef()
}
- defer wd.DecRef()
+ opener = fsbridge.NewFSLookup(mntns, root, wd)
+ fsContext = newFSContext(root, wd, args.Umask)
}
+ tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
+
// Check which file to start from.
switch {
case args.Filename != "":
@@ -805,11 +875,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
}
// Create a fresh task context.
- remainingTraversals = uint(args.MaxSymlinkTraversals)
+ remainingTraversals := args.MaxSymlinkTraversals
loadArgs := loader.LoadArgs{
- Mounts: mounts,
- Root: root,
- WorkingDirectory: wd,
+ Opener: opener,
RemainingTraversals: &remainingTraversals,
ResolveFinal: true,
Filename: args.Filename,
@@ -834,13 +902,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
Kernel: k,
ThreadGroup: tg,
TaskContext: tc,
- FSContext: newFSContext(root, wd, args.Umask),
+ FSContext: fsContext,
FDTable: args.FDTable,
Credentials: args.Credentials,
AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
UTSNamespace: args.UTSNamespace,
IPCNamespace: args.IPCNamespace,
AbstractSocketNamespace: args.AbstractSocketNamespace,
+ MountNamespaceVFS2: args.MountNamespaceVFS2,
ContainerID: args.ContainerID,
}
t, err := k.tasks.NewTask(config)
@@ -1100,6 +1169,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
k.sendExternalSignal(info, context)
}
+// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup.
+// This function doesn't skip signals like SendExternalSignal does.
+func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ return tg.SendSignal(info)
+}
+
// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
@@ -1378,6 +1455,20 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return ctx.k.globalInit.mounts.Root()
}
return nil
+ case vfs.CtxRoot:
+ if ctx.k.globalInit == nil {
+ return vfs.VirtualDentry{}
+ }
+ mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
+ defer mntns.DecRef()
+ // Root() takes a reference on the root dirent for us.
+ return mntns.Root()
+ case vfs.CtxMountNamespace:
+ if ctx.k.globalInit == nil {
+ return nil
+ }
+ // MountNamespaceVFS2() takes a reference for us.
+ return ctx.k.GlobalInit().Leader().MountNamespaceVFS2()
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
@@ -1423,3 +1514,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
Registers: t.Arch().StateData().Proto(),
})
}
+
+// VFS returns the virtual filesystem for the kernel.
+func (k *Kernel) VFS() *vfs.VirtualFilesystem {
+ return &k.vfs
+}
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index efebfd872..18416643b 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() {
return
}
- buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
- if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+ var cs linux.RSeqCriticalSection
+ if _, err := cs.CopyIn(t, critAddr); err != nil {
t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
return
}
- // Manually marshal RSeqCriticalSection as this is in the hot path when
- // rseq is enabled. It must be as fast as possible.
- //
- // TODO(b/130243041): Replace with go_marshal.
- cs := linux.RSeqCriticalSection{
- Version: usermem.ByteOrder.Uint32(buf[0:4]),
- Flags: usermem.ByteOrder.Uint32(buf[4:8]),
- Start: usermem.ByteOrder.Uint64(buf[8:16]),
- PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
- Abort: usermem.ByteOrder.Uint64(buf[24:32]),
- }
-
if cs.Version != 0 {
t.Debugf("Unknown version in %+v", cs)
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 981e8c7fe..a3443ff21 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -424,6 +424,11 @@ type Task struct {
// abstractSockets is protected by mu.
abstractSockets *AbstractSocketNamespace
+ // mountNamespaceVFS2 is the task's mount namespace.
+ //
+ // It is protected by mu. It is owned by the task goroutine.
+ mountNamespaceVFS2 *vfs.MountNamespace
+
// parentDeathSignal is sent to this task's thread group when its parent exits.
//
// parentDeathSignal is protected by mu.
@@ -638,6 +643,11 @@ func (t *Task) Value(key interface{}) interface{} {
return int32(t.ThreadGroup().ID())
case fs.CtxRoot:
return t.fsContext.RootDirectory()
+ case vfs.CtxRoot:
+ return t.fsContext.RootDirectoryVFS2()
+ case vfs.CtxMountNamespace:
+ t.mountNamespaceVFS2.IncRef()
+ return t.mountNamespaceVFS2
case fs.CtxDirentCacheLimiter:
return t.k.DirentCacheLimiter
case inet.CtxStack:
@@ -701,6 +711,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
+ if VFS2Enabled {
+ realRoot := t.mountNamespaceVFS2.Root()
+ defer realRoot.DecRef()
+ root := t.fsContext.RootDirectoryVFS2()
+ defer root.DecRef()
+ return root != realRoot
+ }
+
realRoot := t.tg.mounts.Root()
defer realRoot.DecRef()
root := t.fsContext.RootDirectory()
@@ -796,6 +814,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace {
return t.tg.mounts
}
+// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
+// returned mount namespace.
+func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ t.mountNamespaceVFS2.IncRef()
+ return t.mountNamespaceVFS2
+}
+
// AbstractSockets returns t's AbstractSocketNamespace.
func (t *Task) AbstractSockets() *AbstractSocketNamespace {
return t.abstractSockets
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 53d4d211b..ba74b4c1c 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -199,6 +199,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
ipcns = NewIPCNamespace(userns)
}
+ // TODO(b/63601033): Implement CLONE_NEWNS.
+ mntnsVFS2 := t.mountNamespaceVFS2
+ if mntnsVFS2 != nil {
+ mntnsVFS2.IncRef()
+ }
+
tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
return 0, nil, err
@@ -241,7 +247,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
rseqAddr := usermem.Addr(0)
rseqSignature := uint32(0)
if opts.NewThreadGroup {
- tg.mounts.IncRef()
+ if tg.mounts != nil {
+ tg.mounts.IncRef()
+ }
sh := t.tg.signalHandlers
if opts.NewSignalHandlers {
sh = sh.Fork()
@@ -265,6 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
UTSNamespace: utsns,
IPCNamespace: ipcns,
AbstractSocketNamespace: t.abstractSockets,
+ MountNamespaceVFS2: mntnsVFS2,
RSeqAddr: rseqAddr,
RSeqSignature: rseqSignature,
ContainerID: t.ContainerID(),
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 2d6e7733c..2be982684 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -136,7 +136,7 @@ func (t *Task) Stack() *arch.Stack {
func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
// If File is not nil, we should load that instead of resolving Filename.
if args.File != nil {
- args.Filename = args.File.MappedName(ctx)
+ args.Filename = args.File.PathnameWithDeleted(ctx)
}
// Prepare a new user address space to load into.
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 435761e5a..c4ade6e8e 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.fsContext.DecRef()
t.fdTable.DecRef()
+ t.mu.Lock()
+ if t.mountNamespaceVFS2 != nil {
+ t.mountNamespaceVFS2.DecRef()
+ t.mountNamespaceVFS2 = nil
+ }
+ t.mu.Unlock()
+
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index 41259210c..6d737d3e5 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) {
if !trace.IsEnabled() {
return
}
- d := tc.MemoryManager.Executable()
- if d == nil {
+ file := tc.MemoryManager.Executable()
+ if file == nil {
trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
return
}
- defer d.DecRef()
- root := t.fsContext.RootDirectory()
- if root == nil {
- trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
- return
- }
- defer root.DecRef()
- n, _ := d.FullName(root)
- trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
+ defer file.DecRef()
+ trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
}
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index de838beef..f9236a842 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -80,6 +81,9 @@ type TaskConfig struct {
// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
AbstractSocketNamespace *AbstractSocketNamespace
+ // MountNamespaceVFS2 is the MountNamespace of the new task.
+ MountNamespaceVFS2 *vfs.MountNamespace
+
// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
RSeqAddr usermem.Addr
@@ -116,28 +120,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
parent: cfg.Parent,
children: make(map[*Task]struct{}),
},
- runState: (*runApp)(nil),
- interruptChan: make(chan struct{}, 1),
- signalMask: cfg.SignalMask,
- signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
- tc: *tc,
- fsContext: cfg.FSContext,
- fdTable: cfg.FDTable,
- p: cfg.Kernel.Platform.NewContext(),
- k: cfg.Kernel,
- ptraceTracees: make(map[*Task]struct{}),
- allowedCPUMask: cfg.AllowedCPUMask.Copy(),
- ioUsage: &usage.IO{},
- niceness: cfg.Niceness,
- netns: cfg.NetworkNamespaced,
- utsns: cfg.UTSNamespace,
- ipcns: cfg.IPCNamespace,
- abstractSockets: cfg.AbstractSocketNamespace,
- rseqCPU: -1,
- rseqAddr: cfg.RSeqAddr,
- rseqSignature: cfg.RSeqSignature,
- futexWaiter: futex.NewWaiter(),
- containerID: cfg.ContainerID,
+ runState: (*runApp)(nil),
+ interruptChan: make(chan struct{}, 1),
+ signalMask: cfg.SignalMask,
+ signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
+ tc: *tc,
+ fsContext: cfg.FSContext,
+ fdTable: cfg.FDTable,
+ p: cfg.Kernel.Platform.NewContext(),
+ k: cfg.Kernel,
+ ptraceTracees: make(map[*Task]struct{}),
+ allowedCPUMask: cfg.AllowedCPUMask.Copy(),
+ ioUsage: &usage.IO{},
+ niceness: cfg.Niceness,
+ netns: cfg.NetworkNamespaced,
+ utsns: cfg.UTSNamespace,
+ ipcns: cfg.IPCNamespace,
+ abstractSockets: cfg.AbstractSocketNamespace,
+ mountNamespaceVFS2: cfg.MountNamespaceVFS2,
+ rseqCPU: -1,
+ rseqAddr: cfg.RSeqAddr,
+ rseqSignature: cfg.RSeqSignature,
+ futexWaiter: futex.NewWaiter(),
+ containerID: cfg.ContainerID,
}
t.creds.Store(cfg.Credentials)
t.endStopCond.L = &t.tg.signalHandlers.mu
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 768e958d2..268f62e9d 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -256,7 +256,7 @@ type ThreadGroup struct {
tty *TTY
}
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been
// created inside of it by a successful call to TaskSet.NewTask.
@@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() {
for _, it := range its {
it.DestroyTimer()
}
- tg.mounts.DecRef()
+ if tg.mounts != nil {
+ tg.mounts.DecRef()
+ }
}
// forEachChildThreadGroupLocked indicates over all child ThreadGroups.
diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD
index 23790378a..c6aa65f28 100644
--- a/pkg/sentry/loader/BUILD
+++ b/pkg/sentry/loader/BUILD
@@ -33,6 +33,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/limits",
"//pkg/sentry/memmap",
@@ -40,6 +41,7 @@ go_library(
"//pkg/sentry/pgalloc",
"//pkg/sentry/uniqueid",
"//pkg/sentry/usage",
+ "//pkg/sentry/vfs",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index 122ed05c2..616fafa2c 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -27,7 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -97,11 +97,11 @@ type elfInfo struct {
// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
//
// ctx may be nil if f does not need it.
-func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
+func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) {
// Check ident first; it will tell us the endianness of the rest of the
// structs.
var ident [elf.EI_NIDENT]byte
- _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0)
+ _, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0)
if err != nil {
log.Infof("Error reading ELF ident: %v", err)
// The entire ident array always exists.
@@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
var hdr elf.Header64
hdrBuf := make([]byte, header64Size)
- _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0)
+ _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0)
if err != nil {
log.Infof("Error reading ELF header: %v", err)
// The entire header always exists.
@@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
}
phdrBuf := make([]byte, totalPhdrSize)
- _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
+ _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff))
if err != nil {
log.Infof("Error reading ELF phdrs: %v", err)
// If phdrs were specified, they should all exist.
@@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) {
// mapSegment maps a phdr into the Task. offset is the offset to apply to
// phdr.Vaddr.
-func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
+func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error {
// We must make a page-aligned mapping.
adjust := usermem.Addr(phdr.Vaddr).PageOffset()
@@ -395,7 +395,7 @@ type loadedELF struct {
//
// Preconditions:
// * f is an ELF file
-func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
+func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) {
first := true
var start, end usermem.Addr
var interpreter string
@@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
}
path := make([]byte, phdr.Filesz)
- _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off))
+ _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
if err != nil {
// If an interpreter was specified, it should exist.
ctx.Infof("Error reading PT_INTERP path: %v", err)
@@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
// Preconditions:
// * f is an ELF file
// * f is the first ELF loaded into m
-func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) {
+func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
info, err := parseHeader(ctx, f)
if err != nil {
ctx.Infof("Failed to parse initial ELF: %v", err)
@@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS
//
// Preconditions:
// * f is an ELF file
-func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) {
+func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) {
info, err := parseHeader(ctx, f)
if err != nil {
if err == syserror.ENOEXEC {
@@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error
// Refresh the traversal limit.
*args.RemainingTraversals = linux.MaxSymlinkTraversals
args.Filename = bin.interpreter
- d, i, err := openPath(ctx, args)
+ intFile, err := openPath(ctx, args)
if err != nil {
ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err)
return loadedELF{}, nil, err
}
- defer i.DecRef()
- // We don't need the Dirent.
- d.DecRef()
+ defer intFile.DecRef()
- interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin)
+ interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin)
if err != nil {
ctx.Infof("Error loading interpreter: %v", err)
return loadedELF{}, nil, err
diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go
index 098a45d36..3886b4d33 100644
--- a/pkg/sentry/loader/interpreter.go
+++ b/pkg/sentry/loader/interpreter.go
@@ -19,7 +19,7 @@ import (
"io"
"gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -37,9 +37,9 @@ const (
)
// parseInterpreterScript returns the interpreter path and argv.
-func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) {
+func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) {
line := make([]byte, interpMaxLineLength)
- n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0)
+ n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0)
// Short read is OK.
if err != nil && err != io.ErrUnexpectedEOF {
if err == io.EOF {
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index 9a613d6b7..d6675b8f0 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -20,7 +20,6 @@ import (
"fmt"
"io"
"path"
- "strings"
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -29,8 +28,10 @@ import (
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
@@ -41,16 +42,6 @@ type LoadArgs struct {
// MemoryManager is the memory manager to load the executable into.
MemoryManager *mm.MemoryManager
- // Mounts is the mount namespace in which to look up Filename.
- Mounts *fs.MountNamespace
-
- // Root is the root directory under which to look up Filename.
- Root *fs.Dirent
-
- // WorkingDirectory is the working directory under which to look up
- // Filename.
- WorkingDirectory *fs.Dirent
-
// RemainingTraversals is the maximum number of symlinks to follow to
// resolve Filename. This counter is passed by reference to keep it
// updated throughout the call stack.
@@ -65,7 +56,12 @@ type LoadArgs struct {
// File is an open fs.File object of the executable. If File is not
// nil, then File will be loaded and Filename will be ignored.
- File *fs.File
+ //
+ // The caller is responsible for checking that the user can execute this file.
+ File fsbridge.File
+
+ // Opener is used to open the executable file when 'File' is nil.
+ Opener fsbridge.Lookup
// CloseOnExec indicates that the executable (or one of its parent
// directories) was opened with O_CLOEXEC. If the executable is an
@@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in
// installed in the Task FDTable. The caller takes ownership of both.
//
// args.Filename must be a readable, executable, regular file.
-func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) {
+func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) {
if args.Filename == "" {
ctx.Infof("cannot open empty name")
- return nil, nil, syserror.ENOENT
- }
-
- var d *fs.Dirent
- var err error
- if args.ResolveFinal {
- d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
- } else {
- d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals)
- }
- if err != nil {
- return nil, nil, err
- }
- // Defer a DecRef for the sake of failure cases.
- defer d.DecRef()
-
- if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) {
- return nil, nil, syserror.ELOOP
- }
-
- if err := checkPermission(ctx, d); err != nil {
- return nil, nil, err
- }
-
- // If they claim it's a directory, then make sure.
- //
- // N.B. we reject directories below, but we must first reject
- // non-directories passed as directories.
- if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) {
- return nil, nil, syserror.ENOTDIR
- }
-
- if err := checkIsRegularFile(ctx, d, args.Filename); err != nil {
- return nil, nil, err
- }
-
- f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
- if err != nil {
- return nil, nil, err
- }
- // Defer a DecRef for the sake of failure cases.
- defer f.DecRef()
-
- if err := checkPread(ctx, f, args.Filename); err != nil {
- return nil, nil, err
- }
-
- d.IncRef()
- f.IncRef()
- return d, f, err
-}
-
-// checkFile performs checks on a file to be executed.
-func checkFile(ctx context.Context, f *fs.File, filename string) error {
- if err := checkPermission(ctx, f.Dirent); err != nil {
- return err
- }
-
- if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil {
- return err
+ return nil, syserror.ENOENT
}
- return checkPread(ctx, f, filename)
-}
-
-// checkPermission checks whether the file is readable and executable.
-func checkPermission(ctx context.Context, d *fs.Dirent) error {
- perms := fs.PermMask{
- // TODO(gvisor.dev/issue/160): Linux requires only execute
- // permission, not read. However, our backing filesystems may
- // prevent us from reading the file without read permission.
- //
- // Additionally, a task with a non-readable executable has
- // additional constraints on access via ptrace and procfs.
- Read: true,
- Execute: true,
+ // TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+ // not read. However, our backing filesystems may prevent us from reading
+ // the file without read permission. Additionally, a task with a
+ // non-readable executable has additional constraints on access via
+ // ptrace and procfs.
+ opts := vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ FileExec: true,
}
- return d.Inode.CheckPermission(ctx, perms)
+ return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal)
}
// checkIsRegularFile prevents us from trying to execute a directory, pipe, etc.
-func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error {
- attr := d.Inode.StableAttr
- if !fs.IsRegular(attr) {
- ctx.Infof("%s is not regular: %v", filename, attr)
- return syserror.EACCES
+func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error {
+ t, err := file.Type(ctx)
+ if err != nil {
+ return err
}
- return nil
-}
-
-// checkPread checks whether we can read the file at arbitrary offsets.
-func checkPread(ctx context.Context, f *fs.File, filename string) error {
- if !f.Flags().Pread {
- ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags())
+ if t != linux.ModeRegular {
+ ctx.Infof("%q is not a regular file: %v", filename, t)
return syserror.EACCES
}
return nil
@@ -224,8 +149,10 @@ const (
maxLoaderAttempts = 6
)
-// loadExecutable loads an executable that is pointed to by args.File. If nil,
-// the path args.Filename is resolved and loaded. If the executable is an
+// loadExecutable loads an executable that is pointed to by args.File. The
+// caller is responsible for checking that the user can execute this file.
+// If nil, the path args.Filename is resolved and loaded (check that the user
+// can execute this file is done here in this case). If the executable is an
// interpreter script rather than an ELF, the binary of the corresponding
// interpreter will be loaded.
//
@@ -234,37 +161,27 @@ const (
// * arch.Context matching the binary arch
// * fs.Dirent of the binary file
// * Possibly updated args.Argv
-func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) {
for i := 0; i < maxLoaderAttempts; i++ {
- var (
- d *fs.Dirent
- err error
- )
if args.File == nil {
- d, args.File, err = openPath(ctx, args)
- // We will return d in the successful case, but defer a DecRef for the
- // sake of intermediate loops and failure cases.
- if d != nil {
- defer d.DecRef()
- }
- if args.File != nil {
- defer args.File.DecRef()
+ var err error
+ args.File, err = openPath(ctx, args)
+ if err != nil {
+ ctx.Infof("Error opening %s: %v", args.Filename, err)
+ return loadedELF{}, nil, nil, nil, err
}
+ // Ensure file is release in case the code loops or errors out.
+ defer args.File.DecRef()
} else {
- d = args.File.Dirent
- d.IncRef()
- defer d.DecRef()
- err = checkFile(ctx, args.File, args.Filename)
- }
- if err != nil {
- ctx.Infof("Error opening %s: %v", args.Filename, err)
- return loadedELF{}, nil, nil, nil, err
+ if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
+ return loadedELF{}, nil, nil, nil, err
+ }
}
// Check the header. Is this an ELF or interpreter script?
var hdr [4]uint8
// N.B. We assume that reading from a regular file cannot block.
- _, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0)
+ _, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0)
// Allow unexpected EOF, as a valid executable could be only three bytes
// (e.g., #!a).
if err != nil && err != io.ErrUnexpectedEOF {
@@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
ctx.Infof("Error loading ELF: %v", err)
return loadedELF{}, nil, nil, nil, err
}
- // An ELF is always terminal. Hold on to d.
- d.IncRef()
- return loaded, ac, d, args.Argv, err
+ // An ELF is always terminal. Hold on to file.
+ args.File.IncRef()
+ return loaded, ac, args.File, args.Argv, err
+
case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
if args.CloseOnExec {
return loadedELF{}, nil, nil, nil, syserror.ENOENT
@@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
}
// Refresh the traversal limit for the interpreter.
*args.RemainingTraversals = linux.MaxSymlinkTraversals
+
default:
ctx.Infof("Unknown magic: %v", hdr)
return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
@@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context
// * Load is called on the Task goroutine.
func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
// Load the executable itself.
- loaded, ac, d, newArgv, err := loadExecutable(ctx, args)
+ loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
if err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
}
- defer d.DecRef()
+ defer file.DecRef()
// Load the VDSO.
vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
@@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V
m.SetEnvvStart(sl.EnvvStart)
m.SetEnvvEnd(sl.EnvvEnd)
m.SetAuxv(auxv)
- m.SetExecutable(d)
+ m.SetExecutable(file)
ac.SetIP(uintptr(loaded.entry))
ac.SetStack(uintptr(stack.Bottom))
diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go
index 52f446ed7..161b28c2c 100644
--- a/pkg/sentry/loader/vdso.go
+++ b/pkg/sentry/loader/vdso.go
@@ -27,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -69,6 +70,8 @@ type byteReader struct {
var _ fs.FileOperations = (*byteReader)(nil)
// newByteReaderFile creates a fake file to read data from.
+//
+// TODO(gvisor.dev/issue/1623): Convert to VFS2.
func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
// Create a fake inode.
inode := fs.NewInode(
@@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq
// * PT_LOAD segments don't extend beyond the end of the file.
//
// ctx may be nil if f does not need it.
-func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
+func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) {
info, err := parseHeader(ctx, f)
if err != nil {
log.Infof("Unable to parse VDSO header: %v", err)
@@ -221,7 +224,7 @@ type VDSO struct {
// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
// param page for updating by the kernel.
func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
- vdsoFile := newByteReaderFile(ctx, vdsoBin)
+ vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin))
// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
// nil context can be passed.
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index e5729ced5..73591dab7 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -105,8 +105,8 @@ go_library(
"//pkg/safecopy",
"//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/fs",
"//pkg/sentry/fs/proc/seqfile",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/futex",
"//pkg/sentry/kernel/shm",
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index f550acae0..6a49334f4 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -16,7 +16,7 @@ package mm
import (
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) {
//
// An additional reference will be taken in the case of a non-nil executable,
// which must be released by the caller.
-func (mm *MemoryManager) Executable() *fs.Dirent {
+func (mm *MemoryManager) Executable() fsbridge.File {
mm.metadataMu.Lock()
defer mm.metadataMu.Unlock()
@@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent {
// SetExecutable sets the executable.
//
// This takes a reference on d.
-func (mm *MemoryManager) SetExecutable(d *fs.Dirent) {
+func (mm *MemoryManager) SetExecutable(file fsbridge.File) {
mm.metadataMu.Lock()
// Grab a new reference.
- d.IncRef()
+ file.IncRef()
// Set the executable.
orig := mm.executable
- mm.executable = d
+ mm.executable = file
mm.metadataMu.Unlock()
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 09e582dd3..637383c7a 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -37,7 +37,7 @@ package mm
import (
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -215,7 +215,7 @@ type MemoryManager struct {
// is not nil, it holds a reference on the Dirent.
//
// executable is protected by metadataMu.
- executable *fs.Dirent
+ executable fsbridge.File
// dumpability describes if and how this MemoryManager may be dumped to
// userspace.
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 971eed7fa..4f2406ce3 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -7,7 +7,7 @@ go_template(
name = "generic_walker",
srcs = select_arch(
amd64 = ["walker_amd64.go"],
- arm64 = ["walker_amd64.go"],
+ arm64 = ["walker_arm64.go"],
),
opt_types = [
"Visitor",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index d322e4144..2ec11f6ac 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -591,10 +591,9 @@ func parseTarget(optVal []byte) (iptables.Target, error) {
if standardTarget.Verdict < 0 {
// A Verdict < 0 indicates a non-jump verdict.
return translateToStandardTarget(standardTarget.Verdict)
- } else {
- // A verdict >= 0 indicates a jump.
- return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
}
+ // A verdict >= 0 indicates a jump.
+ return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil
case errorTargetName:
// Error target.
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index a796b2396..46cb2a1cc 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string {
}
func fd(t *kernel.Task, fd int32) string {
+ if kernel.VFS2Enabled {
+ return fdVFS2(t, fd)
+ }
+
root := t.FSContext().RootDirectory()
if root != nil {
defer root.DecRef()
@@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string {
return fmt.Sprintf("%#x %s", fd, name)
}
+func fdVFS2(t *kernel.Task, fd int32) string {
+ root := t.FSContext().RootDirectoryVFS2()
+ defer root.DecRef()
+
+ vfsObj := root.Mount().Filesystem().VirtualFilesystem()
+ if fd == linux.AT_FDCWD {
+ wd := t.FSContext().WorkingDirectoryVFS2()
+ defer wd.DecRef()
+
+ name, _ := vfsObj.PathnameWithDeleted(t, root, wd)
+ return fmt.Sprintf("AT_FDCWD %s", name)
+ }
+
+ file := t.GetFileVFS2(fd)
+ if file == nil {
+ // Cast FD to uint64 to avoid printing negative hex.
+ return fmt.Sprintf("%#x (bad FD)", uint64(fd))
+ }
+ defer file.DecRef()
+
+ name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry())
+ return fmt.Sprintf("%#x %s", fd, name)
+}
+
func fdpair(t *kernel.Task, addr usermem.Addr) string {
var fds [2]int32
_, err := t.CopyIn(addr, &fds)
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index be16ee686..c7883e68e 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -42,8 +42,6 @@ go_library(
"sys_socket.go",
"sys_splice.go",
"sys_stat.go",
- "sys_stat_amd64.go",
- "sys_stat_arm64.go",
"sys_sync.go",
"sys_sysinfo.go",
"sys_syslog.go",
@@ -74,6 +72,7 @@ go_library(
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
"//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/fsbridge",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/epoll",
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 5f11b496c..fbef5b376 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -83,8 +83,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
mask = waiter.EventMaskFromLinux(e.Events)
- data[0] = e.Fd
- data[1] = e.Data
+ data = e.Data
}
// Perform the requested operations.
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 98db32d77..9c6728530 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
@@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
// Set the underlying executable.
- t.MemoryManager().SetExecutable(file.Dirent)
+ t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file))
case linux.PR_SET_MM_AUXV,
linux.PR_SET_MM_START_CODE,
diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go
index c841abccb..8b66a9006 100644
--- a/pkg/sentry/syscalls/linux/sys_stat.go
+++ b/pkg/sentry/syscalls/linux/sys_stat.go
@@ -23,6 +23,24 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
+ return linux.Stat{
+ Dev: sattr.DeviceID,
+ Ino: sattr.InodeID,
+ Nlink: uattr.Links,
+ Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()),
+ UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()),
+ GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()),
+ Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)),
+ Size: uattr.Size,
+ Blksize: sattr.BlockSize,
+ Blocks: uattr.Usage / 512,
+ ATime: uattr.AccessTime.Timespec(),
+ MTime: uattr.ModificationTime.Timespec(),
+ CTime: uattr.StatusChangeTime.Timespec(),
+ }
+}
+
// Stat implements linux syscall stat(2).
func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
@@ -112,7 +130,9 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err
if err != nil {
return err
}
- return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr)
+ s := statFromAttrs(t, d.Inode.StableAttr, uattr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
}
// fstat implements fstat for the given *fs.File.
@@ -121,7 +141,9 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error {
if err != nil {
return err
}
- return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr)
+ s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr)
+ _, err = s.CopyOut(t, statAddr)
+ return err
}
// Statx implements linux syscall statx(2).
diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go
deleted file mode 100644
index 75a567bd4..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build amd64
-
-package linux
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
- b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
- // Dev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
- // Ino (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
- // Nlink (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links)
- // Mode (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
- // UID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
- // GID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
- // Padding (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, 0)
- // Rdev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
- // Size (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
- // Blksize (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize))
- // Blocks (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
- // ATime
- atime := uattr.AccessTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
- // MTime
- mtime := uattr.ModificationTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
- // CTime
- ctime := uattr.StatusChangeTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
- _, err := t.CopyOutBytes(dst, b)
- return err
-}
diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go
deleted file mode 100644
index 80c98d05c..000000000
--- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//+build arm64
-
-package linux
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/sentry/fs"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/usermem"
-)
-
-// copyOutStat copies the attributes (sattr, uattr) to the struct stat at
-// address dst in t's address space. It encodes the stat struct to bytes
-// manually, as stat() is a very common syscall for many applications, and
-// t.CopyObjectOut has noticeable performance impact due to its many slice
-// allocations and use of reflection.
-func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error {
- b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0]
-
- // Dev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID))
- // Ino (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID))
- // Mode (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode()))
- // Nlink (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links))
- // UID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
- // GID (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()))
- // Rdev (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)))
- // Padding (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, 0)
- // Size (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size))
- // Blksize (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize))
- // Padding (uint32)
- b = binary.AppendUint32(b, usermem.ByteOrder, 0)
- // Blocks (uint64)
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512))
-
- // ATime
- atime := uattr.AccessTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec))
-
- // MTime
- mtime := uattr.ModificationTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec))
-
- // CTime
- ctime := uattr.StatusChangeTime.Timespec()
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec))
- b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec))
-
- _, err := t.CopyOutBytes(dst, b)
- return err
-}
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 0c9e2255d..00915fdde 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
"gvisor.dev/gvisor/pkg/sentry/loader"
@@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
defer root.DecRef()
var wd *fs.Dirent
- var executable *fs.File
+ var executable fsbridge.File
var closeOnExec bool
if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
// Even if the pathname is absolute, we may still need the wd
@@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
closeOnExec = fdFlags.CloseOnExec
if atEmptyPath && len(pathname) == 0 {
- executable = f
+ // TODO(gvisor.dev/issue/160): Linux requires only execute permission,
+ // not read. However, our backing filesystems may prevent us from reading
+ // the file without read permission. Additionally, a task with a
+ // non-readable executable has additional constraints on access via
+ // ptrace and procfs.
+ if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
+ return 0, nil, err
+ }
+ executable = fsbridge.NewFSFile(f)
} else {
wd = f.Dirent
wd.IncRef()
@@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user
// Load the new TaskContext.
remainingTraversals := uint(linux.MaxSymlinkTraversals)
loadArgs := loader.LoadArgs{
- Mounts: t.MountNamespace(),
- Root: root,
- WorkingDirectory: wd,
+ Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
RemainingTraversals: &remainingTraversals,
ResolveFinal: resolveFinal,
Filename: pathname,
diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
index c134714ee..e0ac32b33 100644
--- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
+++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go
@@ -22,4 +22,110 @@ import (
// Override syscall table to add syscalls implementations from this package.
func Override(table map[uintptr]kernel.Syscall) {
table[0] = syscalls.Supported("read", Read)
+
+ // Remove syscalls that haven't been converted yet. It's better to get ENOSYS
+ // rather than a SIGSEGV deep in the stack.
+ delete(table, 1) // write
+ delete(table, 2) // open
+ delete(table, 3) // close
+ delete(table, 4) // stat
+ delete(table, 5) // fstat
+ delete(table, 6) // lstat
+ delete(table, 7) // poll
+ delete(table, 8) // lseek
+ delete(table, 9) // mmap
+ delete(table, 16) // ioctl
+ delete(table, 17) // pread64
+ delete(table, 18) // pwrite64
+ delete(table, 19) // readv
+ delete(table, 20) // writev
+ delete(table, 21) // access
+ delete(table, 22) // pipe
+ delete(table, 32) // dup
+ delete(table, 33) // dup2
+ delete(table, 40) // sendfile
+ delete(table, 59) // execve
+ delete(table, 72) // fcntl
+ delete(table, 73) // flock
+ delete(table, 74) // fsync
+ delete(table, 75) // fdatasync
+ delete(table, 76) // truncate
+ delete(table, 77) // ftruncate
+ delete(table, 78) // getdents
+ delete(table, 79) // getcwd
+ delete(table, 80) // chdir
+ delete(table, 81) // fchdir
+ delete(table, 82) // rename
+ delete(table, 83) // mkdir
+ delete(table, 84) // rmdir
+ delete(table, 85) // creat
+ delete(table, 86) // link
+ delete(table, 87) // unlink
+ delete(table, 88) // symlink
+ delete(table, 89) // readlink
+ delete(table, 90) // chmod
+ delete(table, 91) // fchmod
+ delete(table, 92) // chown
+ delete(table, 93) // fchown
+ delete(table, 94) // lchown
+ delete(table, 133) // mknod
+ delete(table, 137) // statfs
+ delete(table, 138) // fstatfs
+ delete(table, 161) // chroot
+ delete(table, 162) // sync
+ delete(table, 165) // mount
+ delete(table, 166) // umount2
+ delete(table, 172) // iopl
+ delete(table, 173) // ioperm
+ delete(table, 187) // readahead
+ delete(table, 188) // setxattr
+ delete(table, 189) // lsetxattr
+ delete(table, 190) // fsetxattr
+ delete(table, 191) // getxattr
+ delete(table, 192) // lgetxattr
+ delete(table, 193) // fgetxattr
+ delete(table, 206) // io_setup
+ delete(table, 207) // io_destroy
+ delete(table, 208) // io_getevents
+ delete(table, 209) // io_submit
+ delete(table, 210) // io_cancel
+ delete(table, 213) // epoll_create
+ delete(table, 214) // epoll_ctl_old
+ delete(table, 215) // epoll_wait_old
+ delete(table, 216) // remap_file_pages
+ delete(table, 217) // getdents64
+ delete(table, 232) // epoll_wait
+ delete(table, 233) // epoll_ctl
+ delete(table, 253) // inotify_init
+ delete(table, 254) // inotify_add_watch
+ delete(table, 255) // inotify_rm_watch
+ delete(table, 257) // openat
+ delete(table, 258) // mkdirat
+ delete(table, 259) // mknodat
+ delete(table, 260) // fchownat
+ delete(table, 261) // futimesat
+ delete(table, 262) // fstatat
+ delete(table, 263) // unlinkat
+ delete(table, 264) // renameat
+ delete(table, 265) // linkat
+ delete(table, 266) // symlinkat
+ delete(table, 267) // readlinkat
+ delete(table, 268) // fchmodat
+ delete(table, 269) // faccessat
+ delete(table, 270) // pselect
+ delete(table, 271) // ppoll
+ delete(table, 285) // fallocate
+ delete(table, 291) // epoll_create1
+ delete(table, 292) // dup3
+ delete(table, 293) // pipe2
+ delete(table, 294) // inotify_init1
+ delete(table, 295) // preadv
+ delete(table, 296) // pwritev
+ delete(table, 306) // syncfs
+ delete(table, 316) // renameat2
+ delete(table, 319) // memfd_create
+ delete(table, 322) // execveat
+ delete(table, 327) // preadv2
+ delete(table, 328) // pwritev2
+ delete(table, 332) // statx
}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 14b39eb9d..0b4f18ab5 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -43,6 +43,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
+ "//pkg/log",
"//pkg/sentry/arch",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index d97362b9a..82781e6d3 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -29,9 +29,10 @@ const (
CtxRoot
)
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
if v := ctx.Value(CtxMountNamespace); v != nil {
return v.(*MountNamespace)
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 486a76475..35b208721 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -71,6 +71,8 @@ import (
// lifetime. Dentry reference counts only indicate the extent to which VFS
// requires Dentries to exist; Filesystems may elect to cache or discard
// Dentries with zero references.
+//
+// +stateify savable
type Dentry struct {
// parent is this Dentry's parent in this Filesystem. If this Dentry is
// independent, parent is nil.
@@ -89,7 +91,7 @@ type Dentry struct {
children map[string]*Dentry
// mu synchronizes disowning and mounting over this Dentry.
- mu sync.Mutex
+ mu sync.Mutex `state:"nosave"`
// impl is the DentryImpl associated with this Dentry. impl is immutable.
// This should be the last field in Dentry.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 3af2aa58d..bda5576fa 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -56,6 +56,7 @@ type Device interface {
Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
}
+// +stateify savable
type registeredDevice struct {
dev Device
opts RegisterDeviceOptions
@@ -63,6 +64,8 @@ type registeredDevice struct {
// RegisterDeviceOptions contains options to
// VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
type RegisterDeviceOptions struct {
// GroupName is the name shown for this device registration in
// /proc/devices. If GroupName is empty, this registration will not be
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 7c83f9a5a..eed41139b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -85,8 +85,8 @@ type epollInterest struct {
ready bool
epollInterestEntry
- // userData is the epoll_data_t associated with this epollInterest.
- // userData is protected by epoll.mu.
+ // userData is the struct epoll_event::data associated with this
+ // epollInterest. userData is protected by epoll.mu.
userData [2]int32
}
@@ -157,7 +157,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (
// AddInterest implements the semantics of EPOLL_CTL_ADD.
//
// Preconditions: A reference must be held on file.
-func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
// Check for cyclic polling if necessary.
subep, _ := file.impl.(*EpollInstance)
if subep != nil {
@@ -183,12 +183,12 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
}
// Register interest in file.
- mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
epi := &epollInterest{
epoll: ep,
key: key,
mask: mask,
- userData: userData,
+ userData: event.Data,
}
ep.interest[key] = epi
wmask := waiter.EventMaskFromLinux(mask)
@@ -236,7 +236,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi
// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
//
// Preconditions: A reference must be held on file.
-func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
ep.interestMu.Lock()
defer ep.interestMu.Unlock()
@@ -250,13 +250,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u
}
// Update epi for the next call to ep.ReadEvents().
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
ep.mu.Lock()
epi.mask = mask
- epi.userData = userData
+ epi.userData = event.Data
ep.mu.Unlock()
// Re-register with the new mask.
- mask |= linux.EPOLLERR | linux.EPOLLRDHUP
file.EventUnregister(&epi.waiter)
wmask := waiter.EventMaskFromLinux(mask)
file.EventRegister(&epi.waiter, wmask)
@@ -363,8 +363,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
// Report ievents.
events[i] = linux.EpollEvent{
Events: ievents.ToLinux(),
- Fd: epi.userData[0],
- Data: epi.userData[1],
+ Data: epi.userData,
}
i++
if i == len(events) {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5bac660c7..9a1ad630c 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -435,11 +435,11 @@ type Dirent struct {
// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
type IterDirentsCallback interface {
- // Handle handles the given iterated Dirent. It returns true if iteration
- // should continue, and false if FileDescriptionImpl.IterDirents should
- // terminate now and restart with the same Dirent the next time it is
- // called.
- Handle(dirent Dirent) bool
+ // Handle handles the given iterated Dirent. If Handle returns a non-nil
+ // error, FileDescriptionImpl.IterDirents must stop iteration and return
+ // the error; the next call to FileDescriptionImpl.IterDirents should
+ // restart with the same Dirent.
+ Handle(dirent Dirent) error
}
// OnClose is called when a file descriptor representing the FileDescription is
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 8fa26418e..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
- vfsObj := New() // vfs.New()
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
defer fd.DecRef()
@@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) {
func TestWritable(t *testing.T) {
ctx := contexttest.Context(t)
- vfsObj := New() // vfs.New()
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
defer fd.DecRef()
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a06a6caf3..556976d0b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -29,6 +29,8 @@ import (
// Filesystem methods require that a reference is held.
//
// Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
type Filesystem struct {
// refs is the reference count. refs is accessed using atomic memory
// operations.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c58b70728..bb9cada81 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -44,6 +44,7 @@ type GetFilesystemOptions struct {
InternalData interface{}
}
+// +stateify savable
type registeredFilesystemType struct {
fsType FilesystemType
opts RegisterFilesystemTypeOptions
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1fbb420f9..9912df799 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -38,6 +38,8 @@ import (
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
+//
+// +stateify savable
type Mount struct {
// vfs, fs, and root are immutable. References are held on fs and root.
//
@@ -85,6 +87,8 @@ type Mount struct {
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
type MountNamespace struct {
// root is the MountNamespace's root mount. root is immutable.
root *Mount
@@ -114,6 +118,7 @@ type MountNamespace struct {
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
+ ctx.Warningf("Unknown filesystem: %s", fsTypeName)
return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -231,9 +236,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
return syserror.EINVAL
}
vfs.mountMu.Lock()
- if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
- vfs.mountMu.Unlock()
- return syserror.EINVAL
+ if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+ defer mntns.DecRef()
+ if mntns != vd.mount.ns {
+ vfs.mountMu.Unlock()
+ return syserror.EINVAL
+ }
}
// TODO(jamieliu): Linux special-cases umount of the caller's root, which
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bd90d36c4..1fe766a44 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -64,6 +64,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) {
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
type mountTable struct {
// mountTable is implemented as a seqcount-protected hash table that
// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -75,8 +77,8 @@ type mountTable struct {
// intrinsics and inline assembly, limiting the performance of this
// approach.)
- seq sync.SeqCount
- seed uint32 // for hashing keys
+ seq sync.SeqCount `state:"nosave"`
+ seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -89,7 +91,7 @@ type mountTable struct {
// length and cap in separate uint32s) for ~free.
size uint64
- slots unsafe.Pointer // []mountSlot; never nil after Init
+ slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}
type mountSlot struct {
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index b7774bf28..6af7fdac1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -61,7 +61,7 @@ type MountOptions struct {
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
- // FilesystemImpls is reponsible for implementing the following flags:
+ // FilesystemImpls are responsible for implementing the following flags:
// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
@@ -72,6 +72,11 @@ type OpenOptions struct {
// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
// created file.
Mode linux.FileMode
+
+ // FileExec is set when the file is being opened to be executed.
+ // VirtualFilesystem.OpenAt() checks that the caller has execute permissions
+ // on the file, and that the file is a regular file.
+ FileExec bool
}
// ReadOptions contains options to FileDescription.PRead(),
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f664581f4..8e250998a 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
//
// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
-func AccessTypesForOpenFlags(flags uint32) AccessTypes {
- switch flags & linux.O_ACCMODE {
+func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
+ ats := AccessTypes(0)
+ if opts.FileExec {
+ ats |= MayExec
+ }
+
+ switch opts.Flags & linux.O_ACCMODE {
case linux.O_RDONLY:
- if flags&linux.O_TRUNC != 0 {
- return MayRead | MayWrite
+ if opts.Flags&linux.O_TRUNC != 0 {
+ return ats | MayRead | MayWrite
}
- return MayRead
+ return ats | MayRead
case linux.O_WRONLY:
- return MayWrite
+ return ats | MayWrite
default:
- return MayRead | MayWrite
+ return ats | MayRead | MayWrite
}
}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 908c69f91..8f29031b2 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -46,11 +46,13 @@ import (
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
+//
+// +stateify savable
type VirtualFilesystem struct {
// mountMu serializes mount mutations.
//
// mountMu is analogous to Linux's namespace_sem.
- mountMu sync.Mutex
+ mountMu sync.Mutex `state:"nosave"`
// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
// are uniquely namespaced, including mount parent in the key correctly
@@ -89,44 +91,42 @@ type VirtualFilesystem struct {
// devices contains all registered Devices. devices is protected by
// devicesMu.
- devicesMu sync.RWMutex
+ devicesMu sync.RWMutex `state:"nosave"`
devices map[devTuple]*registeredDevice
// anonBlockDevMinor contains all allocated anonymous block device minor
// numbers. anonBlockDevMinorNext is a lower bound for the smallest
// unallocated anonymous block device number. anonBlockDevMinorNext and
// anonBlockDevMinor are protected by anonBlockDevMinorMu.
- anonBlockDevMinorMu sync.Mutex
+ anonBlockDevMinorMu sync.Mutex `state:"nosave"`
anonBlockDevMinorNext uint32
anonBlockDevMinor map[uint32]struct{}
// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
// fsTypesMu.
- fsTypesMu sync.RWMutex
+ fsTypesMu sync.RWMutex `state:"nosave"`
fsTypes map[string]*registeredFilesystemType
// filesystems contains all Filesystems. filesystems is protected by
// filesystemsMu.
- filesystemsMu sync.Mutex
+ filesystemsMu sync.Mutex `state:"nosave"`
filesystems map[*Filesystem]struct{}
}
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
- vfs := &VirtualFilesystem{
- mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- devices: make(map[devTuple]*registeredDevice),
- anonBlockDevMinorNext: 1,
- anonBlockDevMinor: make(map[uint32]struct{}),
- fsTypes: make(map[string]*registeredFilesystemType),
- filesystems: make(map[*Filesystem]struct{}),
- }
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+ vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+ vfs.devices = make(map[devTuple]*registeredDevice)
+ vfs.anonBlockDevMinorNext = 1
+ vfs.anonBlockDevMinor = make(map[uint32]struct{})
+ vfs.fsTypes = make(map[string]*registeredFilesystemType)
+ vfs.filesystems = make(map[*Filesystem]struct{})
vfs.mounts.Init()
// Construct vfs.anonMount.
anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
if err != nil {
- panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+ return err
}
anonfs := anonFilesystem{
devMinor: anonfsDevMinor,
@@ -137,8 +137,7 @@ func New() *VirtualFilesystem {
fs: &anonfs.vfsfs,
refs: 1,
}
-
- return vfs
+ return nil
}
// PathOperation specifies the path operated on by a VFS method.
@@ -379,6 +378,26 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
+
+ // TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
+ // to FileDescription.Stat().
+ if opts.FileExec {
+ // Only a regular file can be executed.
+ stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
+ if err != nil {
+ return nil, err
+ }
+ if stat.Mask&linux.STATX_TYPE != 0 {
+ // This shouldn't happen, but if type can't be retrieved, file can't
+ // be executed.
+ return nil, syserror.EACCES
+ }
+ if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
+ ctx.Infof("%q is not a regular file: %v", pop.Path, t)
+ return nil, syserror.EACCES
+ }
+ }
+
return fd, nil
}
if !rp.handleError(err) {
@@ -724,6 +743,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
// VirtualDentry methods require that a reference is held on the VirtualDentry.
//
// VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
type VirtualDentry struct {
mount *Mount
dentry *Dentry