From 960f6a975b7e44c0efe8fd38c66b02017c4fe137 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 6 Mar 2020 12:58:45 -0800 Subject: Add plumbing for importing fds in VFS2, along with non-socket, non-TTY impl. In VFS2, imported file descriptors are stored in a kernfs-based filesystem. Upon calling ImportFD, the host fd can be accessed in two ways: 1. a FileDescription that can be added to the FDTable, and 2. a Dentry in the host.filesystem mount, which we will want to access through magic symlinks in /proc/[pid]/fd/. An implementation of the kernfs.Inode interface stores a unique host fd. This inode can be inserted into file descriptions as well as dentries. This change also plumbs in three FileDescriptionImpls corresponding to fds for sockets, TTYs, and other files (only the latter is implemented here). These implementations will mostly make corresponding syscalls to the host. Where possible, the logic is ported over from pkg/sentry/fs/host. Updates #1672 PiperOrigin-RevId: 299417263 --- pkg/sentry/fsimpl/host/BUILD | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 pkg/sentry/fsimpl/host/BUILD (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD new file mode 100644 index 000000000..731f192b3 --- /dev/null +++ b/pkg/sentry/fsimpl/host/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "host", + srcs = [ + "default_file.go", + "host.go", + "util.go", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) -- cgit v1.2.3 From 5e413cad10d2358a21dd08216953faee70e62a0b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 14 Mar 2020 07:13:15 -0700 Subject: Plumb VFS2 imported fds into virtual filesystem. - When setting up the virtual filesystem, mount a host.filesystem to contain all files that need to be imported. - Make read/preadv syscalls to the host in cases where preadv2 may not be supported yet (likewise for writing). - Make save/restore functions in kernel/kernel.go return early if vfs2 is enabled. PiperOrigin-RevId: 300922353 --- pkg/abi/linux/file.go | 3 + pkg/sentry/fs/host/control.go | 2 + pkg/sentry/fsimpl/host/BUILD | 2 + pkg/sentry/fsimpl/host/default_file.go | 45 +++++++----- pkg/sentry/fsimpl/host/host.go | 124 ++++++++++++++++++++++++++++++--- pkg/sentry/fsimpl/host/util.go | 28 ++------ pkg/sentry/kernel/kernel.go | 40 +++++++---- pkg/sentry/syscalls/linux/sys_stat.go | 5 +- pkg/sentry/syscalls/linux/vfs2/stat.go | 6 +- runsc/boot/filter/config.go | 1 + test/syscalls/linux/stat.cc | 60 ++++++++++++++-- 11 files changed, 246 insertions(+), 70 deletions(-) (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index e229ac21c..dbe58acbe 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -266,6 +266,9 @@ type Statx struct { DevMinor uint32 } +// SizeOfStatx is the size of a Statx struct. +var SizeOfStatx = binary.Size(Statx{}) + // FileMode represents a mode_t. type FileMode uint16 diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 1658979fc..cd84e1337 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -32,6 +32,8 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. +// +// TODO(gvisor.dev/issue/2017): Port to VFS2. func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max var trunc bool diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 731f192b3..5d67f88e3 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -9,9 +9,11 @@ go_library( "host.go", "util.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fd", "//pkg/log", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go index 172cdb161..98682ba5e 100644 --- a/pkg/sentry/fsimpl/host/default_file.go +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -64,9 +65,7 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read // rather than retrying until complete. @@ -86,16 +85,22 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v return n, err } -func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + if flags != 0 { return 0, syserror.EOPNOTSUPP } - reader := safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv2(fd, srcs, offset, flags) - return int64(n), err - }, + var reader safemem.Reader + if offset == -1 { + reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} + } else { + reader = safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := dst.CopyOutFrom(ctx, reader) return int64(n), err @@ -120,9 +125,7 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { err = syserror.ErrWouldBlock } @@ -137,16 +140,22 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts return n, err } -func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. + if flags != 0 { return 0, syserror.EOPNOTSUPP } - writer := safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev2(fd, srcs, offset, flags) - return int64(n), err - }, + var writer safemem.Writer + if offset == -1 { + writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} + } else { + writer = safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := src.CopyInTo(ctx, writer) return int64(n), err diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c205e6a0b..0be812d13 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -38,10 +38,19 @@ type filesystem struct { kernfs.Filesystem } +// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. +func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { + fs := &filesystem{} + fs.Init(vfsObj) + vfsfs := fs.VFSFilesystem() + // NewDisconnectedMount will take an additional reference on vfsfs. + defer vfsfs.DecRef() + return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{}) +} + // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { - // Must be importing to a mount of host.filesystem. - fs, ok := mnt.Filesystem().Impl().(*filesystem) + fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } @@ -54,8 +63,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets can return EWOULDBLOCK for - // operations that would block. + // Pipes, character devices, and sockets. isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK i := &inode{ @@ -143,11 +151,109 @@ func (i *inode) Mode() linux.FileMode { // Stat implements kernfs.Inode. func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + if opts.Mask&linux.STATX__RESERVED != 0 { + return linux.Statx{}, syserror.EINVAL + } + if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return linux.Statx{}, syserror.EINVAL + } + + // Limit our host call only to known flags. + mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t - if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + if err == syserror.ENOSYS { + return i.fstat(opts) + } else if err != nil { + return linux.Statx{}, err + } + + ls := linux.Statx{Mask: mask} + // Unconditionally fill blksize, attributes, and device numbers, as indicated + // by /include/uapi/linux/stat.h. + // + // RdevMajor/RdevMinor are left as zero, so as not to expose host device + // numbers. + // + // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined + // device numbers. If we use the device number from the host, it may collide + // with another sentry-internal device number. We handle device/inode + // numbers without relying on the host to prevent collisions. + ls.Blksize = s.Blksize + ls.Attributes = s.Attributes + ls.AttributesMask = s.Attributes_mask + + if mask|linux.STATX_TYPE != 0 { + ls.Mode |= s.Mode & linux.S_IFMT + } + if mask|linux.STATX_MODE != 0 { + ls.Mode |= s.Mode &^ linux.S_IFMT + } + if mask|linux.STATX_NLINK != 0 { + ls.Nlink = s.Nlink + } + if mask|linux.STATX_ATIME != 0 { + ls.Atime = unixToLinuxStatxTimestamp(s.Atime) + } + if mask|linux.STATX_BTIME != 0 { + ls.Btime = unixToLinuxStatxTimestamp(s.Btime) + } + if mask|linux.STATX_CTIME != 0 { + ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) + } + if mask|linux.STATX_MTIME != 0 { + ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) + } + if mask|linux.STATX_SIZE != 0 { + ls.Size = s.Size + } + if mask|linux.STATX_BLOCKS != 0 { + ls.Blocks = s.Blocks + } + + // Use our own internal inode number and file owner. + if mask|linux.STATX_INO != 0 { + ls.Ino = i.ino + } + if mask|linux.STATX_UID != 0 { + ls.UID = uint32(i.uid) + } + if mask|linux.STATX_GID != 0 { + ls.GID = uint32(i.gid) + } + + return ls, nil +} + +// fstat is a best-effort fallback for inode.Stat() if the host does not +// support statx(2). +// +// We ignore the mask and sync flags in opts and simply supply +// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification +// of a mask or sync flags. fstat(2) does not provide any metadata +// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so +// those fields remain empty. +func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Stat_t + if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - ls := unixToLinuxStatx(s) + + // Note that rdev numbers are left as 0; do not expose host device numbers. + ls := linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + Mode: uint16(s.Mode), + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + } // Use our own internal inode number and file owner. // @@ -159,9 +265,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.UID = uint32(i.uid) ls.GID = uint32(i.gid) - // Update file mode from the host. - i.mode = linux.FileMode(ls.Mode) - return ls, nil } @@ -217,7 +320,6 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio } func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - fileType := i.mode.FileType() if fileType == syscall.S_IFSOCK { if i.isTTY { @@ -227,6 +329,8 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error return nil, errors.New("importing host sockets not supported") } + // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that + // we don't allow importing arbitrary file types without proper support. if i.isTTY { // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. return nil, errors.New("importing host fd as TTY not supported") diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index e1ccacb4d..d519feef5 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -35,34 +35,14 @@ func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { } } -func unixToLinuxStatx(s unix.Statx_t) linux.Statx { - return linux.Statx{ - Mask: s.Mask, - Blksize: s.Blksize, - Attributes: s.Attributes, - Nlink: s.Nlink, - UID: s.Uid, - GID: s.Gid, - Mode: s.Mode, - Ino: s.Ino, - Size: s.Size, - Blocks: s.Blocks, - AttributesMask: s.Attributes_mask, - Atime: unixToLinuxStatxTimestamp(s.Atime), - Btime: unixToLinuxStatxTimestamp(s.Btime), - Ctime: unixToLinuxStatxTimestamp(s.Ctime), - Mtime: unixToLinuxStatxTimestamp(s.Mtime), - RdevMajor: s.Rdev_major, - RdevMinor: s.Rdev_minor, - DevMajor: s.Dev_major, - DevMinor: s.Dev_minor, - } -} - func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} } +func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} +} + // wouldBlock returns true for file types that can return EWOULDBLOCK // for blocking operations, e.g. pipes, character devices, and sockets. func wouldBlock(fileType uint32) bool { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..6feda8fa1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + }) + } } } k.timekeeper.PauseUpdates() @@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + }) + } } } } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9bd2df104..a11a87cd1 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -136,7 +136,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if mask&linux.STATX__RESERVED > 0 { + if mask&linux.STATX__RESERVED != 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, syserror.EINVAL } if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index a74ea6fd5..97eaedd66 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -150,7 +150,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { + return 0, nil, syserror.EINVAL + } + + if mask&linux.STATX__RESERVED != 0 { return 0, nil, syserror.EINVAL } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a4627905e..f459d1973 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -284,6 +284,7 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index c951ac3b3..513b9cd1c 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -607,7 +607,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask, } TEST_F(StatTest, StatxAbsPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -617,7 +617,7 @@ TEST_F(StatTest, StatxAbsPath) { } TEST_F(StatTest, StatxRelPathDirFD) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -631,7 +631,7 @@ TEST_F(StatTest, StatxRelPathDirFD) { } TEST_F(StatTest, StatxRelPathCwd) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); @@ -643,7 +643,7 @@ TEST_F(StatTest, StatxRelPathCwd) { } TEST_F(StatTest, StatxEmptyPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); @@ -653,6 +653,58 @@ TEST_F(StatTest, StatxEmptyPath) { EXPECT_TRUE(S_ISREG(stx.stx_mode)); } +TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set all mask bits except for STATX__RESERVED. + uint mask = 0xffffffff & ~0x80000000; + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxRejectsReservedMaskBit) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set STATX__RESERVED in the mask. + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(StatTest, StatxSymlink) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + std::string parent_dir = "/tmp"; + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent_dir, test_file_name_)); + std::string p = link.path(); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISLNK(stx.stx_mode)); + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxInvalidFlags) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), + 0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx), + SyscallFailsWithErrno(EINVAL)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 3a42638a0b32ceede66d8d593609b424bbdba47e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 18 Mar 2020 19:08:46 -0700 Subject: Port imported TTY fds to vfs2. Refactor fs/host.TTYFileOperations so that the relevant functionality can be shared with VFS2 (fsimpl/host.ttyFD). Incorporate host.defaultFileFD into the default host.fileDescription. This way, there is no need for a separate default_file.go. As in vfs1, the TTY file implementation can be built on top of this default and override operations as necessary (PRead/Read/PWrite/Write, Release, Ioctl). Note that these changes still need to be plumbed into runsc, which refers to imported TTYs in control/proc.go:ExecAsync. Updates #1672. PiperOrigin-RevId: 301718157 --- pkg/sentry/fs/host/ioctl_unsafe.go | 4 + pkg/sentry/fs/host/tty.go | 5 + pkg/sentry/fs/host/util.go | 8 +- pkg/sentry/fsimpl/host/BUILD | 6 +- pkg/sentry/fsimpl/host/default_file.go | 247 --------------------- pkg/sentry/fsimpl/host/host.go | 258 ++++++++++++++++++++-- pkg/sentry/fsimpl/host/ioctl_unsafe.go | 56 +++++ pkg/sentry/fsimpl/host/tty.go | 379 +++++++++++++++++++++++++++++++++ 8 files changed, 692 insertions(+), 271 deletions(-) delete mode 100644 pkg/sentry/fsimpl/host/default_file.go create mode 100644 pkg/sentry/fsimpl/host/ioctl_unsafe.go create mode 100644 pkg/sentry/fsimpl/host/tty.go (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 271582e54..150ac8e19 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) +// LINT.IfChange + func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) @@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error { } return nil } + +// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 3f218b4a7..cb91355ab 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // TTYFileOperations implements fs.FileOperations for a host file descriptor // that wraps a TTY FD. // @@ -43,6 +45,7 @@ type TTYFileOperations struct { // connected to this TTY. fgProcessGroup *kernel.ProcessGroup + // termios contains the terminal attributes for this TTY. termios linux.KernelTermios } @@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } + +// LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 388108fdf..1b0356930 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -23,7 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -80,9 +80,9 @@ func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr { Usage: s.Blocks * 512, Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), Owner: owner(s), - AccessTime: time.FromUnix(s.Atim.Sec, s.Atim.Nsec), - ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), - StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), Links: uint64(s.Nlink), } } diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 5d67f88e3..0bb4a5c3e 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -5,8 +5,9 @@ licenses(["notice"]) go_library( name = "host", srcs = [ - "default_file.go", "host.go", + "ioctl_unsafe.go", + "tty.go", "util.go", ], visibility = ["//pkg/sentry:internal"], @@ -17,9 +18,12 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", + "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go deleted file mode 100644 index 459238603..000000000 --- a/pkg/sentry/fsimpl/host/default_file.go +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package host - -import ( - "math" - "syscall" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files. -type defaultFileFD struct { - fileDescription - - // canMap specifies whether we allow the file to be memory mapped. - canMap bool - - // mu protects the fields below. - mu sync.Mutex - - // offset specifies the current file offset. - offset int64 -} - -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. - -// PRead implements FileDescriptionImpl. -func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags)) -} - -// Read implements FileDescriptionImpl. -func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - if isBlockError(err) { - // If we got any data at all, return it as a "completed" partial read - // rather than retrying until complete. - if n != 0 { - err = nil - } else { - err = syserror.ErrWouldBlock - } - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - f.mu.Lock() - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. - if flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - var reader safemem.Reader - if offset == -1 { - reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} - } else { - reader = safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv(hostFD, srcs, offset) - return int64(n), err - }, - } - } - n, err := dst.CopyOutFrom(ctx, reader) - return int64(n), err -} - -// PWrite implements FileDescriptionImpl. -func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) -} - -// Write implements FileDescriptionImpl. -func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - if isBlockError(err) { - err = syserror.ErrWouldBlock - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - f.mu.Lock() - n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. - if flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) - if err != nil { - return 0, err - } - src = src.TakeFirst64(limit) - - var writer safemem.Writer - if offset == -1 { - writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} - } else { - writer = safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev(hostFD, srcs, offset) - return int64(n), err - }, - } - } - n, err := src.CopyInTo(ctx, writer) - return int64(n), err -} - -// Seek implements FileDescriptionImpl. -// -// Note that we do not support seeking on directories, since we do not even -// allow directory fds to be imported at all. -func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) { - // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - f.mu.Lock() - defer f.mu.Unlock() - - switch whence { - case linux.SEEK_SET: - if offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = offset - - case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. - if offset > math.MaxInt64-f.offset { - return f.offset, syserror.EOVERFLOW - } - if f.offset+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset += offset - - case linux.SEEK_END: - var s syscall.Stat_t - if err := syscall.Fstat(f.inode.hostFD, &s); err != nil { - return f.offset, err - } - size := s.Size - - // Check for overflow. Note that underflow cannot occur, since size >= 0. - if offset > math.MaxInt64-size { - return f.offset, syserror.EOVERFLOW - } - if size+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = size + offset - - case linux.SEEK_DATA, linux.SEEK_HOLE: - // Modifying the offset in the host file table should not matter, since - // this is the only place where we use it. - // - // For reading and writing, we always rely on our internal offset. - n, err := unix.Seek(f.inode.hostFD, offset, int(whence)) - if err != nil { - return f.offset, err - } - f.offset = n - - default: - // Invalid whence. - return f.offset, syserror.EINVAL - } - - return f.offset, nil -} - -// Sync implements FileDescriptionImpl. -func (f *defaultFileFD) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. - return unix.Fsync(f.inode.hostFD) -} - -// ConfigureMMap implements FileDescriptionImpl. -func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { - if !f.canMap { - return syserror.ENODEV - } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV -} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2eebcd60c..3afb41395 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -19,18 +19,23 @@ package host import ( "errors" "fmt" + "math" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // filesystem implements vfs.FilesystemImpl. @@ -70,10 +75,20 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID hostFD: hostFD, isStream: isStream, isTTY: isTTY, + canMap: canMap(uint32(fileType)), ino: fs.NextIno(), mode: fileMode, uid: ownerUID, gid: ownerGID, + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + offset: 0, + } + + // These files can't be memory mapped, assert this. + if i.isStream && i.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } d := &kernfs.Dentry{} @@ -110,12 +125,17 @@ type inode struct { // This field is initialized at creation time and is immutable. isTTY bool + // canMap specifies whether we allow the file to be memory mapped. + // + // This field is initialized at creation time and is immutable. + canMap bool + // ino is an inode number unique within this filesystem. + // + // This field is initialized at creation time and is immutable. ino uint64 - // mu protects the inode metadata below. - // TODO(gvisor.dev/issue/1672): actually protect fields below. - //mu sync.Mutex + // TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex. // mode is the file mode of this inode. Note that this value may become out // of date if the mode is changed on the host, e.g. with chmod. @@ -125,6 +145,12 @@ type inode struct { // file created on import, not the fd on the host. uid auth.KUID gid auth.KGID + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. + offset int64 } // Note that these flags may become out of date, since they can be modified @@ -336,36 +362,40 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that // we don't allow importing arbitrary file types without proper support. + var ( + vfsfd *vfs.FileDescription + fdImpl vfs.FileDescriptionImpl + ) if i.isTTY { - // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. - return nil, errors.New("importing host fd as TTY not supported") - } - - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. - fd := &defaultFileFD{ - fileDescription: fileDescription{ - inode: i, - }, - canMap: canMap(uint32(fileType)), - mu: sync.Mutex{}, - offset: 0, + fd := &ttyFD{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultSlaveTermios, + } + vfsfd = &fd.vfsfd + fdImpl = fd + } else { + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &fileDescription{inode: i} + vfsfd = &fd.vfsfd + fdImpl = fd } - vfsfd := &fd.vfsfd flags, err := fileFlagsFromHostFD(i.hostFD) if err != nil { return nil, err } - if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// TODO(gvisor.dev/issue/1672): Implement Waitable interface. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -394,3 +424,193 @@ func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.S func (f *fileDescription) Release() { // noop } + +// PRead implements FileDescriptionImpl. +func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) +} + +// Read implements FileDescriptionImpl. +func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + i.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + var reader safemem.Reader + if offset == -1 { + reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} + } else { + reader = safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv(hostFD, srcs, offset) + return int64(n), err + }, + } + } + n, err := dst.CopyOutFrom(ctx, reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags) +} + +// Write implements FileDescriptionImpl. +func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + i.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + var writer safemem.Writer + if offset == -1 { + writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} + } else { + writer = safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev(hostFD, srcs, offset) + return int64(n), err + }, + } + } + n, err := src.CopyInTo(ctx, writer) + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + i.offsetMu.Lock() + defer i.offsetMu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. + if offset > math.MaxInt64-i.offset { + return i.offset, syserror.EOVERFLOW + } + if i.offset+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return i.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return i.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(i.hostFD, offset, int(whence)) + if err != nil { + return i.offset, err + } + i.offset = n + + default: + // Invalid whence. + return i.offset, syserror.EINVAL + } + + return i.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *fileDescription) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.inode.canMap { + return syserror.ENODEV + } + // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. + return syserror.ENODEV +} diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go new file mode 100644 index 000000000..0983bf7d8 --- /dev/null +++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go new file mode 100644 index 000000000..8936afb06 --- /dev/null +++ b/pkg/sentry/fsimpl/host/tty.go @@ -0,0 +1,379 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor +// that wraps a TTY FD. +type ttyFD struct { + fileDescription + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // session is the session attached to this ttyFD. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. + fgProcessGroup *kernel.ProcessGroup + + // termios contains the terminal attributes for this TTY. + termios linux.KernelTermios +} + +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. +func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *ttyFD) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileDescription.Release() +} + +// PRead implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl. +func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.Write(ctx, src, opts) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.inode.hostFD + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + if err == nil { + t.termios.FromTermios(termios) + } + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on this + // terminal. + + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() + // to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Check that new process group is in the TTY session. + if pg.Session() != t.session { + return 0, syserror.EPERM + } + + t.fgProcessGroup = pg + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + + // Unlike setting the termios, any process group (even background ones) can + // set the winsize. + + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + // Unimplemented commands. + case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough + default: + return 0, syserror.ENOTTY + } +} + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + // + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) + return kernel.ERESTARTSYS +} -- cgit v1.2.3 From 248e46f320525704da917e148a8f69d9b74671a0 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 19 Mar 2020 23:29:15 -0700 Subject: Whitelist utimensat(2). utimensat is used by hostfs for setting timestamps on imported fds. Previously, this would crash the sandbox since utimensat was not allowed. Correct the VFS2 version of hostfs to match the call in VFS1. PiperOrigin-RevId: 301970121 --- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/host.go | 4 ++-- pkg/sentry/fsimpl/host/util.go | 8 ++++---- pkg/sentry/fsimpl/host/util_unsafe.go | 34 ++++++++++++++++++++++++++++++++++ runsc/boot/filter/config.go | 8 ++++++++ 5 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/util_unsafe.go (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 0bb4a5c3e..82e1fb74b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -9,6 +9,7 @@ go_library( "ioctl_unsafe.go", "tty.go", "util.go", + "util_unsafe.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 3afb41395..1f735628f 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -322,11 +322,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { - timestamps := []unix.Timespec{ + ts := [2]syscall.Timespec{ toTimespec(s.Atime, m&linux.STATX_ATIME == 0), toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), } - if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil { + if err := setTimestamps(i.hostFD, &ts); err != nil { return err } } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index d519feef5..2bc757b1a 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -22,15 +22,15 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { +func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec { if omit { - return unix.Timespec{ + return syscall.Timespec{ Sec: 0, Nsec: unix.UTIME_OMIT, } } - return unix.Timespec{ - Sec: int64(ts.Sec), + return syscall.Timespec{ + Sec: ts.Sec, Nsec: int64(ts.Nsec), } } diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go new file mode 100644 index 000000000..5136ac844 --- /dev/null +++ b/pkg/sentry/fsimpl/host/util_unsafe.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +func setTimestamps(fd int, ts *[2]syscall.Timespec) error { + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(ts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index f459d1973..06b9f888a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -291,6 +291,14 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(uint64(os.Getpid())), }, }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, syscall.SYS_WRITE: {}, // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR -- cgit v1.2.3 From e0c67014cb2200ad58cd28b12fddb3f55652a21b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Apr 2020 11:06:59 -0700 Subject: Factor fsimpl/gofer.host{Preadv,Pwritev} out of fsimpl/gofer. Also fix returning EOF when 0 bytes are read. PiperOrigin-RevId: 308089875 --- pkg/sentry/fsimpl/gofer/BUILD | 2 +- pkg/sentry/fsimpl/gofer/handle.go | 5 +- pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 ------------------- pkg/sentry/fsimpl/host/BUILD | 3 +- pkg/sentry/fsimpl/host/host.go | 31 ++------- pkg/sentry/hostfd/BUILD | 17 +++++ pkg/sentry/hostfd/hostfd.go | 84 ++++++++++++++++++++++++ pkg/sentry/hostfd/hostfd_unsafe.go | 107 +++++++++++++++++++++++++++++++ 8 files changed, 218 insertions(+), 97 deletions(-) delete mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go create mode 100644 pkg/sentry/hostfd/BUILD create mode 100644 pkg/sentry/hostfd/hostfd.go create mode 100644 pkg/sentry/hostfd/hostfd_unsafe.go (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index acd061905..b9c4beee4 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -35,7 +35,6 @@ go_library( "fstree.go", "gofer.go", "handle.go", - "handle_unsafe.go", "p9file.go", "pagemath.go", "regular_file.go", @@ -53,6 +52,7 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index cfe66f797..724a3f1f7 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPreadv(h.fd, dsts, int64(offset)) + n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } @@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPwritev(h.fd, srcs, int64(offset)) + n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go deleted file mode 100644 index 19560ab26..000000000 --- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safemem" -) - -// Preconditions: !dsts.IsEmpty(). -func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if dsts.NumBlocks() == 1 { - // Use pread() instead of preadv() to avoid iovec allocation and - // copying. - dst := dsts.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(dsts) - n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} - -// Preconditions: !srcs.IsEmpty(). -func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if srcs.NumBlocks() == 1 { - // Use pwrite() instead of pwritev() to avoid iovec allocation and - // copying. - src := srcs.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(srcs) - n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 82e1fb74b..44dd9f672 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -15,12 +15,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", - "//pkg/fd", "//pkg/log", "//pkg/refs", - "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index fe14476f1..ae94cfa6e 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -25,11 +25,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -492,19 +491,9 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off if flags != 0 { return 0, syserror.EOPNOTSUPP } - - var reader safemem.Reader - if offset == -1 { - reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} - } else { - reader = safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv(hostFD, srcs, offset) - return int64(n), err - }, - } - } + reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) return int64(n), err } @@ -542,19 +531,9 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs if flags != 0 { return 0, syserror.EOPNOTSUPP } - - var writer safemem.Writer - if offset == -1 { - writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} - } else { - writer = safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev(hostFD, srcs, offset) - return int64(n), err - }, - } - } + writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) return int64(n), err } diff --git a/pkg/sentry/hostfd/BUILD b/pkg/sentry/hostfd/BUILD new file mode 100644 index 000000000..364a78306 --- /dev/null +++ b/pkg/sentry/hostfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "hostfd", + srcs = [ + "hostfd.go", + "hostfd_unsafe.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/safemem", + "//pkg/sync", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/hostfd/hostfd.go b/pkg/sentry/hostfd/hostfd.go new file mode 100644 index 000000000..70dd9cafb --- /dev/null +++ b/pkg/sentry/hostfd/hostfd.go @@ -0,0 +1,84 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostfd provides efficient I/O with host file descriptors. +package hostfd + +import ( + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sync" +) + +// ReadWriterAt implements safemem.Reader and safemem.Writer by reading from +// and writing to a host file descriptor respectively. ReadWriterAts should be +// obtained by calling GetReadWriterAt. +// +// Clients should usually prefer to use Preadv2 and Pwritev2 directly. +type ReadWriterAt struct { + fd int32 + offset int64 + flags uint32 +} + +var rwpool = sync.Pool{ + New: func() interface{} { + return &ReadWriterAt{} + }, +} + +// GetReadWriterAt returns a ReadWriterAt that reads from / writes to the given +// host file descriptor, starting at the given offset and using the given +// preadv2(2)/pwritev2(2) flags. If offset is -1, the host file descriptor's +// offset is used instead. Users are responsible for ensuring that fd remains +// valid for the lifetime of the returned ReadWriterAt, and must call +// PutReadWriterAt when it is no longer needed. +func GetReadWriterAt(fd int32, offset int64, flags uint32) *ReadWriterAt { + rw := rwpool.Get().(*ReadWriterAt) + *rw = ReadWriterAt{ + fd: fd, + offset: offset, + flags: flags, + } + return rw +} + +// PutReadWriterAt releases a ReadWriterAt returned by a previous call to +// GetReadWriterAt that is no longer in use. +func PutReadWriterAt(rw *ReadWriterAt) { + rwpool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *ReadWriterAt) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + n, err := Preadv2(rw.fd, dsts, rw.offset, rw.flags) + if rw.offset >= 0 { + rw.offset += int64(n) + } + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *ReadWriterAt) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + n, err := Pwritev2(rw.fd, srcs, rw.offset, rw.flags) + if rw.offset >= 0 { + rw.offset += int64(n) + } + return n, err +} diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go new file mode 100644 index 000000000..5e9e60fc4 --- /dev/null +++ b/pkg/sentry/hostfd/hostfd_unsafe.go @@ -0,0 +1,107 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostfd + +import ( + "io" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/safemem" +) + +// Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into +// dsts. offset and flags are interpreted as for preadv2(2). +// +// Preconditions: !dsts.IsEmpty(). +func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + var ( + n uintptr + e syscall.Errno + ) + // Avoid preadv2(2) if possible, since it's relatively new and thus least + // likely to be supported by the host kernel. + if flags == 0 { + if dsts.NumBlocks() == 1 { + // Use read() or pread() to avoid iovec allocation and copying. + dst := dsts.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len())) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } else { + iovs := safemem.IovecsFromBlockSeq(dsts) + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_READV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } + } else { + iovs := safemem.IovecsFromBlockSeq(dsts) + n, _, e = syscall.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) + } + if e != 0 { + return 0, e + } + if n == 0 { + return 0, io.EOF + } + return uint64(n), nil +} + +// Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor +// fd. offset and flags are interpreted as for pwritev2(2). +// +// Preconditions: !srcs.IsEmpty(). +func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + var ( + n uintptr + e syscall.Errno + ) + // Avoid pwritev2(2) if possible, since it's relatively new and thus least + // likely to be supported by the host kernel. + if flags == 0 { + if srcs.NumBlocks() == 1 { + // Use write() or pwrite() to avoid iovec allocation and copying. + src := srcs.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len())) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } else { + iovs := safemem.IovecsFromBlockSeq(srcs) + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_WRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } + } else { + iovs := safemem.IovecsFromBlockSeq(srcs) + n, _, e = syscall.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) + } + if e != 0 { + return 0, e + } + return uint64(n), nil +} -- cgit v1.2.3 From f13f26d17da56d585fd9857a81175bbd0be8ce60 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 13:45:31 -0700 Subject: Port SCM Rights to VFS2. Fixes #1477. PiperOrigin-RevId: 308317511 --- pkg/sentry/fs/host/control.go | 6 +- pkg/sentry/fsimpl/host/BUILD | 3 + pkg/sentry/fsimpl/host/control.go | 96 ++++++++++++++++++++++ pkg/sentry/socket/control/BUILD | 6 +- pkg/sentry/socket/control/control.go | 24 +++++- pkg/sentry/socket/control/control_vfs2.go | 131 ++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/socket.go | 2 +- 7 files changed, 260 insertions(+), 8 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/control.go create mode 100644 pkg/sentry/socket/control/control_vfs2.go (limited to 'pkg/sentry/fsimpl/host/BUILD') diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 52c0504b6..39299b7e4 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) +// LINT.IfChange + type scmRights struct { fds []int } @@ -32,8 +34,6 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. -// -// TODO(gvisor.dev/issue/2017): Port to VFS2. func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max var trunc bool @@ -93,3 +93,5 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File { } return files } + +// LINT.ThenChange(../../fsimpl/host/control.go) diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 44dd9f672..2dcb03a73 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -5,6 +5,7 @@ licenses(["notice"]) go_library( name = "host", srcs = [ + "control.go", "host.go", "ioctl_unsafe.go", "tty.go", @@ -23,6 +24,8 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go new file mode 100644 index 000000000..b9082a20f --- /dev/null +++ b/pkg/sentry/fsimpl/host/control.go @@ -0,0 +1,96 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRightsVFS2 { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(c.fds); n > l { + n = l + } else if n < l { + trunc = true + } + + rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements transport.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { + ctx.Warningf("Error setting flags on host FD file: %v", err) + break + } + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 4d42d29cb..ca16d0381 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "control", - srcs = ["control.go"], + srcs = [ + "control.go", + "control_vfs2.go", + ], imports = [ "gvisor.dev/gvisor/pkg/sentry/fs", ], @@ -18,6 +21,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8834a1e1a..8b439a078 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -41,6 +41,8 @@ type SCMCredentials interface { Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) } +// LINT.IfChange + // SCMRights represents a SCM_RIGHTS socket control message. type SCMRights interface { transport.RightsControlMessage @@ -142,6 +144,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flag return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } +// LINT.ThenChange(./control_vfs2.go) + // scmCredentials represents an SCM_CREDENTIALS socket control message. // // +stateify savable @@ -537,11 +541,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con } if len(fds) > 0 { - rights, err := NewSCMRights(t, fds) - if err != nil { - return socket.ControlMessages{}, err + if kernel.VFS2Enabled { + rights, err := NewSCMRightsVFS2(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights + } else { + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights } - cmsgs.Unix.Rights = rights } return cmsgs, nil @@ -566,6 +578,8 @@ func MakeCreds(t *kernel.Task) SCMCredentials { return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } +// LINT.IfChange + // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ @@ -573,3 +587,5 @@ func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transpo Rights: rights, } } + +// LINT.ThenChange(./control_vfs2.go) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go new file mode 100644 index 000000000..fd08179be --- /dev/null +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SCMRightsVFS2 represents a SCM_RIGHTS socket control message. +type SCMRightsVFS2 interface { + transport.RightsControlMessage + + // Files returns up to max RightsFiles. + // + // Returned files are consumed and ownership is transferred to the caller. + // Subsequent calls to Files will return the next files. + Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool) +} + +// RightsFiles represents a SCM_RIGHTS socket control message. A reference is +// maintained for each vfs.FileDescription and is release either when an FD is created or +// when the Release method is called. +type RightsFilesVFS2 []*vfs.FileDescription + +// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message +// representation using local sentry FDs. +func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { + files := make(RightsFilesVFS2, 0, len(fds)) + for _, fd := range fds { + file := t.GetFileVFS2(fd) + if file == nil { + files.Release() + return nil, syserror.EBADF + } + files = append(files, file) + } + return &files, nil +} + +// Files implements SCMRights.Files. +func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(*fs); n > l { + n = l + } else if n < l { + trunc = true + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { + nfs := append(RightsFilesVFS2(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements transport.RightsControlMessage.Release. +func (fs *RightsFilesVFS2) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDsVFS2 gets up to the specified maximum number of FDs. +func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds, trunc +} + +// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf. +func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. + if maxFDs <= 0 { + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC + } + align := t.Arch().Width() + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) +} + +// NewVFS2 creates default control messages if needed. +func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages { + return transport.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index b1ede32f0..10b668477 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -804,7 +804,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } if cms.Unix.Rights != nil { - controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) + controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } // Copy the address to the caller. -- cgit v1.2.3