diff options
Diffstat (limited to 'pkg/sentry')
261 files changed, 14934 insertions, 2004 deletions
diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index e8b794179..e759dc36f 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -1,13 +1,11 @@ -# This BUILD file defines a package_group that allows for interdependencies for -# sentry-internal packages. - package(licenses = ["notice"]) +# The "internal" package_group should be used as much as possible by packages +# that should remain Sentry-internal (i.e. not be exposed directly to command +# line tooling or APIs). package_group( name = "internal", packages = [ - "//cloud/gvisor/gopkg/sentry/...", - "//cloud/gvisor/sentry/...", "//pkg/sentry/...", "//runsc/...", # Code generated by go_marshal relies on go_marshal libraries. diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 34c0a867d..e27f21e5e 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -14,6 +14,7 @@ go_library( "arch_state_aarch64.go", "arch_state_x86.go", "arch_x86.go", + "arch_x86_impl.go", "auxv.go", "signal.go", "signal_act.go", diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 3b6987665..5053393c1 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -32,29 +32,38 @@ import ( const ( // SyscallWidth is the width of insturctions. SyscallWidth = 4 + + // fpsimdMagic is the magic number which is used in fpsimd_context. + fpsimdMagic = 0x46508001 + + // fpsimdContextSize is the size of fpsimd_context. + fpsimdContextSize = 0x210 ) +// ARMTrapFlag is the mask for the trap flag. +const ARMTrapFlag = uint64(1) << 21 + // aarch64FPState is aarch64 floating point state. type aarch64FPState []byte -// initAarch64FPState (defined in asm files) sets up initial state. -func initAarch64FPState(data *FloatingPointData) { - // TODO(gvisor.dev/issue/1238): floating-point is not supported. +// initAarch64FPState sets up initial state. +func initAarch64FPState(data aarch64FPState) { + binary.LittleEndian.PutUint32(data, fpsimdMagic) + binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize) } func newAarch64FPStateSlice() []byte { - return alignedBytes(4096, 32)[:4096] + return alignedBytes(4096, 16)[:fpsimdContextSize] } // newAarch64FPState returns an initialized floating point state. // // The returned state is large enough to store all floating point state // supported by host, even if the app won't use much of it due to a restricted -// FeatureSet. Since they may still be able to see state not advertised by -// CPUID we must ensure it does not contain any sentry state. +// FeatureSet. func newAarch64FPState() aarch64FPState { f := aarch64FPState(newAarch64FPStateSlice()) - initAarch64FPState(f.FloatingPointData()) + initAarch64FPState(f) return f } @@ -133,10 +142,10 @@ func (s State) Proto() *rpb.Registers { // Fork creates and returns an identical copy of the state. func (s *State) Fork() State { - // TODO(gvisor.dev/issue/1238): floating-point is not supported. return State{ - Regs: s.Regs, - FeatureSet: s.FeatureSet, + Regs: s.Regs, + aarch64FPState: s.aarch64FPState.fork(), + FeatureSet: s.FeatureSet, } } @@ -285,8 +294,10 @@ func New(arch Arch, fs *cpuid.FeatureSet) Context { case ARM64: return &context64{ State{ - FeatureSet: fs, + aarch64FPState: newAarch64FPState(), + FeatureSet: fs, }, + []aarch64FPState(nil), } } panic(fmt.Sprintf("unknown architecture %v", arch)) diff --git a/pkg/sentry/arch/arch_amd64.s b/pkg/sentry/arch/arch_amd64.s index bd61402cf..6c10336e7 100644 --- a/pkg/sentry/arch/arch_amd64.s +++ b/pkg/sentry/arch/arch_amd64.s @@ -26,10 +26,11 @@ // // func initX86FPState(data *FloatingPointData, useXsave bool) // -// We need to clear out and initialize an empty fp state area since the sentry -// may have left sensitive information in the floating point registers. +// We need to clear out and initialize an empty fp state area since the sentry, +// or any previous loader, may have left sensitive information in the floating +// point registers. // -// Preconditions: data is zeroed +// Preconditions: data is zeroed. TEXT ·initX86FPState(SB), $24-16 // Save MXCSR (callee-save) STMXCSR mxcsr-8(SP) diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 94f1a808f..885115ae2 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -53,6 +53,11 @@ const ( preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5 ) +var ( + // CPUIDInstruction doesn't exist on ARM64. + CPUIDInstruction = []byte{} +) + // These constants are selected as heuristics to help make the Platform's // potentially limited address space conform as closely to Linux as possible. const ( @@ -68,6 +73,7 @@ const ( // context64 represents an ARM64 context. type context64 struct { State + sigFPState []aarch64FPState // fpstate to be restored on sigreturn. } // Arch implements Context.Arch. @@ -75,10 +81,19 @@ func (c *context64) Arch() Arch { return ARM64 } +func (c *context64) copySigFPState() []aarch64FPState { + var sigfps []aarch64FPState + for _, s := range c.sigFPState { + sigfps = append(sigfps, s.fork()) + } + return sigfps +} + // Fork returns an exact copy of this context. func (c *context64) Fork() Context { return &context64{ - State: c.State.Fork(), + State: c.State.Fork(), + sigFPState: c.copySigFPState(), } } @@ -137,8 +152,8 @@ func (c *context64) SetTLS(value uintptr) bool { return false } -// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP. -func (c *context64) SetRSEQInterruptedIP(value uintptr) { +// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP. +func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { c.Regs.Regs[3] = uint64(value) } diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index d388ee9cf..e35c9214a 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -43,8 +43,8 @@ func (e ErrFloatingPoint) Error() string { // and SSE state, so this is the equivalent XSTATE_BV value. const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE -// afterLoad is invoked by stateify. -func (s *State) afterLoad() { +// afterLoadFPState is invoked by afterLoad. +func (s *State) afterLoadFPState() { old := s.x86FPState // Recreate the slice. This is done to ensure that it is aligned diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index a18093155..88b40a9d1 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -114,6 +114,10 @@ func newX86FPStateSlice() []byte { size, align := cpuid.HostFeatureSet().ExtendedStateSize() capacity := size // Always use at least 4096 bytes. + // + // For the KVM platform, this state is a fixed 4096 bytes, so make sure + // that the underlying array is at _least_ that size otherwise we will + // corrupt random memory. This is not a pleasant thing to debug. if capacity < 4096 { capacity = 4096 } @@ -151,21 +155,6 @@ func NewFloatingPointData() *FloatingPointData { return (*FloatingPointData)(&(newX86FPState()[0])) } -// State contains the common architecture bits for X86 (the build tag of this -// file ensures it's only built on x86). -// -// +stateify savable -type State struct { - // The system registers. - Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` - - // Our floating point state. - x86FPState `state:"wait"` - - // FeatureSet is a pointer to the currently active feature set. - FeatureSet *cpuid.FeatureSet -} - // Proto returns a protobuf representation of the system registers in State. func (s State) Proto() *rpb.Registers { regs := &rpb.AMD64Registers{ diff --git a/pkg/sentry/arch/arch_x86_impl.go b/pkg/sentry/arch/arch_x86_impl.go new file mode 100644 index 000000000..04ac283c6 --- /dev/null +++ b/pkg/sentry/arch/arch_x86_impl.go @@ -0,0 +1,43 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 i386 + +package arch + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/cpuid" +) + +// State contains the common architecture bits for X86 (the build tag of this +// file ensures it's only built on x86). +// +// +stateify savable +type State struct { + // The system registers. + Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"` + + // Our floating point state. + x86FPState `state:"wait"` + + // FeatureSet is a pointer to the currently active feature set. + FeatureSet *cpuid.FeatureSet +} + +// afterLoad is invoked by stateify. +func (s *State) afterLoad() { + s.afterLoadFPState() +} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 81b92bb43..6fb756f0e 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -55,7 +55,7 @@ type SignalContext64 struct { Trapno uint64 Oldmask linux.SignalSet Cr2 uint64 - // Pointer to a struct _fpstate. + // Pointer to a struct _fpstate. See b/33003106#comment8. Fpstate uint64 Reserved [8]uint64 } diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 4f4cc46a8..0c1db4b13 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -30,14 +30,29 @@ type SignalContext64 struct { Sp uint64 Pc uint64 Pstate uint64 - _pad [8]byte // __attribute__((__aligned__(16))) - Reserved [4096]uint8 + _pad [8]byte // __attribute__((__aligned__(16))) + Fpsimd64 FpsimdContext // size = 528 + Reserved [3568]uint8 +} + +type aarch64Ctx struct { + Magic uint32 + Size uint32 +} + +// FpsimdContext is equivalent to struct fpsimd_context on arm64 +// (arch/arm64/include/uapi/asm/sigcontext.h). +type FpsimdContext struct { + Head aarch64Ctx + Fpsr uint32 + Fpcr uint32 + Vregs [64]uint64 // actually [32]uint128 } // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h). type UContext64 struct { Flags uint64 - Link *UContext64 + Link uint64 Stack SignalStack Sigset linux.SignalSet // glibc uses a 1024-bit sigset_t diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index e69496477..d16d78aa5 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -16,10 +16,13 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -27,8 +30,10 @@ go_library( "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", + "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 151808911..663e51989 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -117,9 +117,9 @@ func (p *Profile) HeapProfile(o *ProfileOpts, _ *struct{}) error { return nil } -// Goroutine is an RPC stub which dumps out the stack trace for all running -// goroutines. -func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { +// GoroutineProfile is an RPC stub which dumps out the stack trace for all +// running goroutines. +func (p *Profile) GoroutineProfile(o *ProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { return errNoOutput } @@ -131,6 +131,34 @@ func (p *Profile) Goroutine(o *ProfileOpts, _ *struct{}) error { return nil } +// BlockProfile is an RPC stub which dumps out the stack trace that led to +// blocking on synchronization primitives. +func (p *Profile) BlockProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + if err := pprof.Lookup("block").WriteTo(output, 0); err != nil { + return err + } + return nil +} + +// MutexProfile is an RPC stub which dumps out the stack trace of holders of +// contended mutexes. +func (p *Profile) MutexProfile(o *ProfileOpts, _ *struct{}) error { + if len(o.FilePayload.Files) < 1 { + return errNoOutput + } + output := o.FilePayload.Files[0] + defer output.Close() + if err := pprof.Lookup("mutex").WriteTo(output, 0); err != nil { + return err + } + return nil +} + // StartTrace is an RPC stub which starts collection of an execution trace. func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error { if len(o.FilePayload.Files) < 1 { diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index ced51c66c..5457ba5e7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,19 +18,26 @@ import ( "bytes" "encoding/json" "fmt" + "path" "sort" "strings" "text/tabwriter" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/urpc" ) @@ -60,6 +67,12 @@ type ExecArgs struct { // process's MountNamespace. MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 is the mount namespace to execute the new process in. + // A reference on MountNamespace must be held for the lifetime of the + // ExecArgs. If MountNamespace is nil, it will default to the init + // process's MountNamespace. + MountNamespaceVFS2 *vfs.MountNamespace + // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` @@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, Credentials: creds, FDTable: fdTable, Umask: 0022, @@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } + if initArgs.MountNamespaceVFS2 != nil { + // initArgs must hold a reference on MountNamespaceVFS2, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } ctx := initArgs.NewContext(proc.Kernel) if initArgs.Filename == "" { - // Get the full path to the filename from the PATH env variable. - paths := fs.GetPath(initArgs.Envv) - mns := initArgs.MountNamespace - if mns == nil { - mns = proc.Kernel.GlobalInit().Leader().MountNamespace() - } - f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) - if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + if kernel.VFS2Enabled { + // Get the full path to the filename from the PATH env variable. + if initArgs.MountNamespaceVFS2 == nil { + // Set initArgs so that 'ctx' returns the namespace. + // + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() + } + + paths := fs.GetPath(initArgs.Envv) + vfsObj := proc.Kernel.VFS() + file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.File = fsbridge.NewVFSFile(file) + } else { + // Get the full path to the filename from the PATH env variable. + paths := fs.GetPath(initArgs.Envv) + if initArgs.MountNamespace == nil { + // Set initArgs so that 'ctx' returns the namespace. + initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() + + // initArgs must hold a reference on MountNamespace, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } + f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.Filename = f } - initArgs.Filename = f } mounter := fs.FileOwnerFromContext(ctx) + // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. var ttyFile *fs.File for appFD, hostFile := range args.FilePayload.Files { var appFile *fs.File @@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string { } return fmt.Sprintf("pts/%d", tty.Index) } + +// ResolveExecutablePath resolves the given executable name given a set of +// paths that might contain it. +func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + creds := auth.CredentialsFromContext(ctx) + + // Absolute paths can be used directly. + if path.IsAbs(name) { + return openExecutable(ctx, vfsObj, creds, root, name) + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if len(wd) == 0 { + wd = "/" + } + if !path.IsAbs(wd) { + return nil, fmt.Errorf("working directory %q must be absolute", wd) + } + return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name)) + } + + // Otherwise, we must lookup the name in the paths, starting from the + // calling context's root directory. + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + f, err := openExecutable(ctx, vfsObj, creds, root, binPath) + if err != nil { + return nil, err + } + if f == nil { + continue // Not found/no access. + } + return f, nil + } + return nil, syserror.ENOENT +} + +func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) { + pop := vfs.PathOperation{ + Root: root, + Start: root, // binPath is absolute, Start can be anything. + Path: fspath.Parse(path), + FollowFinalSymlink: true, + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + } + f, err := vfsObj.OpenAt(ctx, creds, &pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + return nil, nil + } + return f, err +} diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD new file mode 100644 index 000000000..abe58f818 --- /dev/null +++ b/pkg/sentry/devices/memdev/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "memdev", + srcs = [ + "full.go", + "memdev.go", + "null.go", + "random.go", + "zero.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/rand", + "//pkg/safemem", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/pgalloc", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go new file mode 100644 index 000000000..c7e197691 --- /dev/null +++ b/pkg/sentry/devices/memdev/full.go @@ -0,0 +1,75 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const fullDevMinor = 7 + +// fullDevice implements vfs.Device for /dev/full. +type fullDevice struct{} + +// Open implements vfs.Device.Open. +func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &fullFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// fullFD implements vfs.FileDescriptionImpl for /dev/full. +type fullFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fullFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSPC +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSPC +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} diff --git a/pkg/sentry/devices/memdev/memdev.go b/pkg/sentry/devices/memdev/memdev.go new file mode 100644 index 000000000..5759900c4 --- /dev/null +++ b/pkg/sentry/devices/memdev/memdev.go @@ -0,0 +1,59 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memdev implements "mem" character devices, as implemented in Linux +// by drivers/char/mem.c and drivers/char/random.c. +package memdev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers all devices implemented by this package in vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + for minor, dev := range map[uint32]vfs.Device{ + nullDevMinor: nullDevice{}, + zeroDevMinor: zeroDevice{}, + fullDevMinor: fullDevice{}, + randomDevMinor: randomDevice{}, + urandomDevMinor: randomDevice{}, + } { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MEM_MAJOR, minor, dev, &vfs.RegisterDeviceOptions{ + GroupName: "mem", + }); err != nil { + return err + } + } + return nil +} + +// CreateDevtmpfsFiles creates device special files in dev representing all +// devices implemented by this package. +func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { + for minor, name := range map[uint32]string{ + nullDevMinor: "null", + zeroDevMinor: "zero", + fullDevMinor: "full", + randomDevMinor: "random", + urandomDevMinor: "urandom", + } { + if err := dev.CreateDeviceFile(ctx, name, vfs.CharDevice, linux.MEM_MAJOR, minor, 0666 /* mode */); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go new file mode 100644 index 000000000..33d060d02 --- /dev/null +++ b/pkg/sentry/devices/memdev/null.go @@ -0,0 +1,76 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "io" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const nullDevMinor = 3 + +// nullDevice implements vfs.Device for /dev/null. +type nullDevice struct{} + +// Open implements vfs.Device.Open. +func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &nullFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// nullFD implements vfs.FileDescriptionImpl for /dev/null. +type nullFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *nullFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, io.EOF +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, io.EOF +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go new file mode 100644 index 000000000..acfa23149 --- /dev/null +++ b/pkg/sentry/devices/memdev/random.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + randomDevMinor = 8 + urandomDevMinor = 9 +) + +// randomDevice implements vfs.Device for /dev/random and /dev/urandom. +type randomDevice struct{} + +// Open implements vfs.Device.Open. +func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &randomFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// randomFD implements vfs.FileDescriptionImpl for /dev/random. +type randomFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // off is the "file offset". off is accessed using atomic memory + // operations. + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *randomFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) + atomic.AddInt64(&fd.off, n) + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // In Linux, this mixes the written bytes into the entropy pool; we just + // throw them away. + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + atomic.AddInt64(&fd.off, src.NumBytes()) + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek + // == noop_llseek + return atomic.LoadInt64(&fd.off), nil +} diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go new file mode 100644 index 000000000..3b1372b9e --- /dev/null +++ b/pkg/sentry/devices/memdev/zero.go @@ -0,0 +1,88 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memdev + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +const zeroDevMinor = 5 + +// zeroDevice implements vfs.Device for /dev/zero. +type zeroDevice struct{} + +// Open implements vfs.Device.Open. +func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &zeroFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// zeroFD implements vfs.FileDescriptionImpl for /dev/zero. +type zeroFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *zeroFD) Release() { + // noop +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return dst.ZeroOut(ctx, dst.NumBytes()) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return src.NumBytes(), nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if err != nil { + return err + } + opts.MappingIdentity = m + opts.Mappable = m + return nil +} diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index f6c79e51b..b060a12ff 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -401,7 +401,7 @@ func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error if err != nil { return err } - lowerXattr, err := lower.ListXattr(ctx) + lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX) if err != nil && err != syserror.EOPNOTSUPP { return err } diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 4c4b7d5cc..9379a4d7b 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -9,6 +9,7 @@ go_library( "device.go", "fs.go", "full.go", + "net_tun.go", "null.go", "random.go", "tty.go", @@ -19,15 +20,20 @@ go_library( "//pkg/context", "//pkg/rand", "//pkg/safemem", + "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", + "//pkg/sentry/socket/netstack", "//pkg/syserror", + "//pkg/tcpip/link/tun", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index 35bd23991..acbd401a0 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,8 +67,8 @@ func newMemDevice(ctx context.Context, iops fs.InodeOperations, msrc *fs.MountSo }) } -func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { - iops := ramfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) +func newDirectory(ctx context.Context, contents map[string]*fs.Inode, msrc *fs.MountSource) *fs.Inode { + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), InodeID: devDevice.NextIno(), @@ -111,7 +112,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // A devpts is typically mounted at /dev/pts to provide // pseudoterminal support. Place an empty directory there for // the devpts to be mounted over. - "pts": newDirectory(ctx, msrc), + "pts": newDirectory(ctx, nil, msrc), // Similarly, applications expect a ptmx device at /dev/ptmx // connected to the terminals provided by /dev/pts/. Rather // than creating a device directly (which requires a hairy @@ -126,6 +127,12 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "tty": newCharacterDevice(ctx, newTTYDevice(ctx, fs.RootOwner, 0666), msrc, ttyDevMajor, ttyDevMinor), } + if isNetTunSupported(inet.StackFromContext(ctx)) { + contents["net"] = newDirectory(ctx, map[string]*fs.Inode{ + "tun": newCharacterDevice(ctx, newNetTunDevice(ctx, fs.RootOwner, 0666), msrc, netTunDevMajor, netTunDevMinor), + }, msrc) + } + iops := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return fs.NewInode(ctx, iops, msrc, fs.StableAttr{ DeviceID: devDevice.DeviceID(), diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go new file mode 100644 index 000000000..dc7ad075a --- /dev/null +++ b/pkg/sentry/fs/dev/net_tun.go @@ -0,0 +1,177 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + netTunDevMajor = 10 + netTunDevMinor = 200 +) + +// +stateify savable +type netTunInodeOperations struct { + fsutil.InodeGenericChecker `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNoopAllocate `state:"nosave"` + fsutil.InodeNoopRelease `state:"nosave"` + fsutil.InodeNoopTruncate `state:"nosave"` + fsutil.InodeNoopWriteOut `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotMappable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.InodeVirtual `state:"nosave"` + + fsutil.InodeSimpleAttributes +} + +var _ fs.InodeOperations = (*netTunInodeOperations)(nil) + +func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *netTunInodeOperations { + return &netTunInodeOperations{ + InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, fs.FilePermsFromMode(mode), linux.TMPFS_MAGIC), + } +} + +// GetFile implements fs.InodeOperations.GetFile. +func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil +} + +// +stateify savable +type netTunFileOperations struct { + fsutil.FileNoSeek `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + device tun.Device +} + +var _ fs.FileOperations = (*netTunFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (fops *netTunFileOperations) Release() { + fops.device.Release() +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + request := args[1].Uint() + data := args[2].Pointer() + + switch request { + case linux.TUNSETIFF: + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + if !t.HasCapability(linux.CAP_NET_ADMIN) { + return 0, syserror.EPERM + } + stack, ok := t.NetworkContext().(*netstack.Stack) + if !ok { + return 0, syserror.EINVAL + } + + var req linux.IFReq + if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + flags := usermem.ByteOrder.Uint16(req.Data[:]) + return 0, fops.device.SetIff(stack.Stack, req.Name(), flags) + + case linux.TUNGETIFF: + var req linux.IFReq + + copy(req.IFName[:], fops.device.Name()) + + // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when + // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. + flags := fops.device.Flags() | linux.IFF_NOFILTER + usermem.ByteOrder.PutUint16(req.Data[:], flags) + + _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +// Write implements fs.FileOperations.Write. +func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + data := make([]byte, src.NumBytes()) + if _, err := src.CopyIn(ctx, data); err != nil { + return 0, err + } + return fops.device.Write(data) +} + +// Read implements fs.FileOperations.Read. +func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + data, err := fops.device.Read() + if err != nil { + return 0, err + } + n, err := dst.CopyOut(ctx, data) + if n > 0 && n < len(data) { + // Not an error for partial copying. Packet truncated. + err = nil + } + return int64(n), err +} + +// Readiness implements watier.Waitable.Readiness. +func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fops.device.Readiness(mask) +} + +// EventRegister implements watier.Waitable.EventRegister. +func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fops.device.EventRegister(e, mask) +} + +// EventUnregister implements watier.Waitable.EventUnregister. +func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) { + fops.device.EventUnregister(e) +} + +// isNetTunSupported returns whether /dev/net/tun device is supported for s. +func isNetTunSupported(s inet.Stack) bool { + _, ok := s.(*netstack.Stack) + return ok +} diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index acab0411a..e0b32e1c1 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -1438,8 +1438,8 @@ func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName }, nil } -func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { - uattr, err := dir.Inode.UnstableAttr(ctx) +func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { + uattr, err := d.Inode.UnstableAttr(ctx) if err != nil { return syserror.EPERM } @@ -1465,30 +1465,33 @@ func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { return syserror.EPERM } -// MayDelete determines whether `name`, a child of `dir`, can be deleted or +// MayDelete determines whether `name`, a child of `d`, can be deleted or // renamed by `ctx`. // // Compare Linux kernel fs/namei.c:may_delete. -func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { - if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { +func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error { + if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { return err } - victim, err := dir.Walk(ctx, root, name) + unlock := d.lockDirectory() + defer unlock() + + victim, err := d.walk(ctx, root, name, true /* may unlock */) if err != nil { return err } defer victim.DecRef() - return mayDelete(ctx, dir, victim) + return d.mayDelete(ctx, victim) } // mayDelete determines whether `victim`, a child of `dir`, can be deleted or // renamed by `ctx`. // // Preconditions: `dir` is writable and executable by `ctx`. -func mayDelete(ctx context.Context, dir, victim *Dirent) error { - if err := checkSticky(ctx, dir, victim); err != nil { +func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error { + if err := d.checkSticky(ctx, victim); err != nil { return err } @@ -1542,7 +1545,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string defer renamed.DecRef() // Check that the renamed dirent is deletable. - if err := mayDelete(ctx, oldParent, renamed); err != nil { + if err := oldParent.mayDelete(ctx, renamed); err != nil { return err } @@ -1580,7 +1583,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // across the Rename, so must call DecRef manually (no defer). // Check that we can delete replaced. - if err := mayDelete(ctx, newParent, replaced); err != nil { + if err := newParent.mayDelete(ctx, replaced); err != nil { replaced.DecRef() return err } diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 25514ace4..33de32c69 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -101,8 +101,6 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.SetPrev(nil) - d.SetNext(nil) d.DecRef() c.currentSize-- if c.limit != nil { diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 02538bb4f..a76d87e3a 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -177,6 +177,7 @@ func TestReaddirRevalidation(t *testing.T) { // TestReaddirOverlayFrozen tests that calling Readdir on an overlay file with // a frozen dirent tree does not make Readdir calls to the underlying files. +// This is a regression test for b/114808269. func TestReaddirOverlayFrozen(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4ab2a384f..789369220 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -28,13 +28,13 @@ go_template_instance( "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", - prefix = "frameRef", + prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", "Range": "platform.FileRange", "Value": "uint64", - "Functions": "frameRefSetFunctions", + "Functions": "FrameRefSetFunctions", }, ) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd63db32b..6564fd0c6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -20,24 +20,25 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" ) -type frameRefSetFunctions struct{} +// FrameRefSetFunctions implements segment.Functions for FrameRefSet. +type FrameRefSetFunctions struct{} // MinKey implements segment.Functions.MinKey. -func (frameRefSetFunctions) MinKey() uint64 { +func (FrameRefSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. -func (frameRefSetFunctions) MaxKey() uint64 { +func (FrameRefSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. -func (frameRefSetFunctions) ClearValue(val *uint64) { +func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 67278aa86..e82afd112 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -65,13 +65,18 @@ type mapping struct { writable bool } -// NewHostFileMapper returns a HostFileMapper with no references or cached -// mappings. +// Init must be called on zero-value HostFileMappers before first use. +func (f *HostFileMapper) Init() { + f.refs = make(map[uint64]int32) + f.mappings = make(map[uint64]mapping) +} + +// NewHostFileMapper returns an initialized HostFileMapper allocated on the +// heap with no references or cached mappings. func NewHostFileMapper() *HostFileMapper { - return &HostFileMapper{ - refs: make(map[uint64]int32), - mappings: make(map[uint64]mapping), - } + f := &HostFileMapper{} + f.Init() + return f } // IncRefOn increments the reference count on all offsets in mr. diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 252830572..1922ff08c 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -247,7 +247,7 @@ func (i *InodeSimpleExtendedAttributes) SetXattr(_ context.Context, _ *fs.Inode, } // ListXattr implements fs.InodeOperations.ListXattr. -func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { +func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { i.mu.RLock() names := make(map[string]struct{}, len(i.xattrs)) for name := range i.xattrs { @@ -257,6 +257,17 @@ func (i *InodeSimpleExtendedAttributes) ListXattr(context.Context, *fs.Inode) (m return names, nil } +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (i *InodeSimpleExtendedAttributes) RemoveXattr(_ context.Context, _ *fs.Inode, name string) error { + i.mu.Lock() + defer i.mu.Unlock() + if _, ok := i.xattrs[name]; ok { + delete(i.xattrs, name) + return nil + } + return syserror.ENOATTR +} + // staticFile is a file with static contents. It is returned by // InodeStaticFileGetter.GetFile. // @@ -460,10 +471,15 @@ func (InodeNoExtendedAttributes) SetXattr(context.Context, *fs.Inode, string, st } // ListXattr implements fs.InodeOperations.ListXattr. -func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { +func (InodeNoExtendedAttributes) ListXattr(context.Context, *fs.Inode, uint64) (map[string]struct{}, error) { return nil, syserror.EOPNOTSUPP } +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (InodeNoExtendedAttributes) RemoveXattr(context.Context, *fs.Inode, string) error { + return syserror.EOPNOTSUPP +} + // InodeNoopRelease implements fs.InodeOperations.Release as a noop. type InodeNoopRelease struct{} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 573b8586e..800c8b4e1 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -111,7 +111,7 @@ type CachingInodeOperations struct { // refs tracks active references to data in the cache. // // refs is protected by dataMu. - refs frameRefSet + refs FrameRefSet } // CachingInodeOperationsOptions configures a CachingInodeOperations. diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md index 71a577d9d..85063d4e6 100644 --- a/pkg/sentry/fs/g3doc/inotify.md +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -112,11 +112,11 @@ attempts to queue a new event, it is already holding `fs.Watches.mu`. If we used `Inotify.mu` to also protect the event queue, this would violate the above lock ordering. -[dirent]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/dirent.go -[event]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_event.go -[fd_table]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/kernel/fd_table.go -[inode]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode.go -[inode_watches]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inode_inotify.go -[inotify]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify.go -[syscall_dir]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/syscalls/linux/ -[watch]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/fs/inotify_watch.go +[dirent]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/dirent.go +[event]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_event.go +[fd_table]: https://github.com/google/gvisor/blob/master/pkg/sentry/kernel/fd_table.go +[inode]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode.go +[inode_watches]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inode_inotify.go +[inotify]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify.go +[syscall_dir]: https://github.com/google/gvisor/blob/master/pkg/sentry/syscalls/linux/ +[watch]: https://github.com/google/gvisor/blob/master/pkg/sentry/fs/inotify_watch.go diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 971d3718e..fea135eea 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -9,6 +9,7 @@ go_library( "cache_policy.go", "context_file.go", "device.go", + "fifo.go", "file.go", "file_state.go", "fs.go", @@ -38,6 +39,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/host", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 71cccdc34..6db4b762d 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -88,8 +88,9 @@ func bsize(pattr p9.Attr) int64 { if pattr.BlockSize > 0 { return int64(pattr.BlockSize) } - // Some files may have no clue of their block size. Better not to report - // something misleading or buggy and have a safe default. + // Some files, particularly those that are not on a local file system, + // may have no clue of their block size. Better not to report something + // misleading or buggy and have a safe default. return usermem.PageSize } @@ -149,6 +150,7 @@ func links(valid p9.AttrMask, pattr p9.Attr) uint64 { } // This node is likely backed by a file system that doesn't support links. + // // We could readdir() and count children directories to provide an accurate // link count. However this may be expensive since the gofer may be backed by remote // storage. Instead, simply return 2 links for directories and 1 for everything else diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index ebea03c42..07a564e92 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -127,6 +127,9 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child childIops, ok := child.InodeOperations.(*inodeOperations) if !ok { + if _, ok := child.InodeOperations.(*fifo); ok { + return false + } panic(fmt.Sprintf("revalidating inode operations of unknown type %T", child.InodeOperations)) } parentIops, ok := parent.InodeOperations.(*inodeOperations) diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 3da818aed..125907d70 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -73,6 +73,20 @@ func (c *contextFile) setXattr(ctx context.Context, name, value string, flags ui return err } +func (c *contextFile) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := c.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + +func (c *contextFile) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := c.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (c *contextFile) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := c.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fs/gofer/fifo.go b/pkg/sentry/fs/gofer/fifo.go new file mode 100644 index 000000000..456557058 --- /dev/null +++ b/pkg/sentry/fs/gofer/fifo.go @@ -0,0 +1,40 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" +) + +// +stateify savable +type fifo struct { + fs.InodeOperations + fileIops *inodeOperations +} + +var _ fs.InodeOperations = (*fifo)(nil) + +// Rename implements fs.InodeOperations. It forwards the call to the underlying +// file inode to handle the file rename. Note that file key remains the same +// after the rename to keep the endpoint mapping. +func (i *fifo) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error { + return i.fileIops.Rename(ctx, inode, oldParent, oldName, newParent, newName, replacement) +} + +// StatFS implements fs.InodeOperations. +func (i *fifo) StatFS(ctx context.Context) (fs.Info, error) { + return i.fileIops.StatFS(ctx) +} diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 0c2f89ae8..2df2fe889 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -61,7 +61,7 @@ func rootTest(t *testing.T, name string, cp cachePolicy, fn func(context.Context ctx := contexttest.Context(t) sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{ file: rootFile, - }, root.QID, p9.AttrMaskAll(), root.Attr, false /* socket */) + }, root.QID, p9.AttrMaskAll(), root.Attr) m := fs.NewMountSource(ctx, s, &filesystem{}, fs.MountSourceFlags{}) rootInode := fs.NewInode(ctx, rootInodeOperations, m, sattr) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index ac28174d2..1c934981b 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -604,18 +604,23 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length } // GetXattr implements fs.InodeOperations.GetXattr. -func (i *inodeOperations) GetXattr(ctx context.Context, inode *fs.Inode, name string, size uint64) (string, error) { +func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) { return i.fileState.file.getXattr(ctx, name, size) } // SetXattr implements fs.InodeOperations.SetXattr. -func (i *inodeOperations) SetXattr(ctx context.Context, inode *fs.Inode, name string, value string, flags uint32) error { +func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error { return i.fileState.file.setXattr(ctx, name, value, flags) } // ListXattr implements fs.InodeOperations.ListXattr. -func (i *inodeOperations) ListXattr(context.Context, *fs.Inode) (map[string]struct{}, error) { - return nil, syscall.EOPNOTSUPP +func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) { + return i.fileState.file.listXattr(ctx, size) +} + +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error { + return i.fileState.file.removeXattr(ctx, name) } // Allocate implements fs.InodeOperations.Allocate. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 0c1be05ef..a35c3a23d 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -23,14 +23,24 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // maxFilenameLen is the maximum length of a filename. This is dictated by 9P's // encoding of strings, which uses 2 bytes for the length prefix. const maxFilenameLen = (1 << 16) - 1 +func changeType(mode p9.FileMode, newType p9.FileMode) p9.FileMode { + if newType&^p9.FileModeMask != 0 { + panic(fmt.Sprintf("newType contained more bits than just file mode: %x", newType)) + } + clear := mode &^ p9.FileModeMask + return clear | newType +} + // Lookup loads an Inode at name into a Dirent based on the session's cache // policy. func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { @@ -69,8 +79,25 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string return nil, err } + if i.session().overrides != nil { + // Check if file belongs to a internal named pipe. Note that it doesn't need + // to check for sockets because it's done in newInodeOperations below. + deviceKey := device.MultiDeviceKey{ + Device: p9attr.RDev, + SecondaryDevice: i.session().connID, + Inode: qids[0].Path, + } + unlock := i.session().overrides.lock() + if pipeInode := i.session().overrides.getPipe(deviceKey); pipeInode != nil { + unlock() + pipeInode.IncRef() + return fs.NewDirent(ctx, pipeInode, name), nil + } + unlock() + } + // Construct the Inode operations. - sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr, false) + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) // Construct a positive Dirent. return fs.NewDirent(ctx, fs.NewInode(ctx, node, dir.MountSource, sattr), name), nil @@ -138,7 +165,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string qid := qids[0] // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr, false) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) @@ -223,82 +250,115 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, return nil, syserror.ENAMETOOLONG } - if i.session().endpoints == nil { + if i.session().overrides == nil { return nil, syscall.EOPNOTSUPP } - // Create replaces the directory fid with the newly created/opened - // file, so clone this directory so it doesn't change out from under - // this node. - _, newFile, err := i.fileState.file.walk(ctx, nil) + // Stabilize the override map while creation is in progress. + unlock := i.session().overrides.lock() + defer unlock() + + sattr, iops, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeSocket) if err != nil { return nil, err } - // We're not going to use newFile after return. - defer newFile.close(ctx) - // Stabilize the endpoint map while creation is in progress. - unlock := i.session().endpoints.lock() - defer unlock() + // Construct the positive Dirent. + childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) + i.session().overrides.addBoundEndpoint(iops.fileState.key, childDir, ep) + return childDir, nil +} - // Create a regular file in the gofer and then mark it as a socket by - // adding this inode key in the 'endpoints' map. - owner := fs.FileOwnerFromContext(ctx) - hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) - if err != nil { - return nil, err +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG } - // We're not going to use this file. - hostFile.Close() - i.touchModificationAndStatusChangeTime(ctx, dir) + owner := fs.FileOwnerFromContext(ctx) + mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe - // Get the attributes of the file to create inode key. - qid, mask, attr, err := getattr(ctx, newFile) - if err != nil { - return nil, err + // N.B. FIFOs use major/minor numbers 0. + if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + if i.session().overrides == nil || err != syscall.EPERM { + return err + } + // If gofer doesn't support mknod, check if we can create an internal fifo. + return i.createInternalFifo(ctx, dir, name, owner, perm) } - key := device.MultiDeviceKey{ - Device: attr.RDev, - SecondaryDevice: i.session().connID, - Inode: qid.Path, + i.touchModificationAndStatusChangeTime(ctx, dir) + return nil +} + +func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode, name string, owner fs.FileOwner, perm fs.FilePermissions) error { + if i.session().overrides == nil { + return syserror.EPERM } - // Create child dirent. + // Stabilize the override map while creation is in progress. + unlock := i.session().overrides.lock() + defer unlock() - // Get an unopened p9.File for the file we created so that it can be - // cloned and re-opened multiple times after creation. - _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + sattr, fileOps, err := i.createEndpointFile(ctx, dir, name, perm, p9.ModeNamedPipe) if err != nil { - return nil, err + return err } - // Construct the InodeOperations. - sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr, true) + // First create a pipe. + p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize) + + // Wrap the fileOps with our Fifo. + iops := &fifo{ + InodeOperations: pipe.NewInodeOperations(ctx, perm, p), + fileIops: fileOps, + } + inode := fs.NewInode(ctx, iops, dir.MountSource, sattr) // Construct the positive Dirent. childDir := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - i.session().endpoints.add(key, childDir, ep) - return childDir, nil + i.session().overrides.addPipe(fileOps.fileState.key, childDir, inode) + return nil } -// CreateFifo implements fs.InodeOperations.CreateFifo. -func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { - if len(name) > maxFilenameLen { - return syserror.ENAMETOOLONG +// Caller must hold Session.endpoint lock. +func (i *inodeOperations) createEndpointFile(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions, fileType p9.FileMode) (fs.StableAttr, *inodeOperations, error) { + _, dirClone, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return fs.StableAttr{}, nil, err } + // We're not going to use dirClone after return. + defer dirClone.close(ctx) + // Create a regular file in the gofer and then mark it as a socket by + // adding this inode key in the 'overrides' map. owner := fs.FileOwnerFromContext(ctx) - mode := p9.FileMode(perm.LinuxMode()) | p9.ModeNamedPipe - - // N.B. FIFOs use major/minor numbers 0. - if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { - return err + hostFile, err := dirClone.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + return fs.StableAttr{}, nil, err } + // We're not going to use this file. + hostFile.Close() i.touchModificationAndStatusChangeTime(ctx, dir) - return nil + + // Get the attributes of the file to create inode key. + qid, mask, attr, err := getattr(ctx, dirClone) + if err != nil { + return fs.StableAttr{}, nil, err + } + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. + _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + return fs.StableAttr{}, nil, err + } + + // Construct new inode with file type overridden. + attr.Mode = changeType(attr.Mode, fileType) + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, attr) + return sattr, iops, nil } // Remove implements InodeOperations.Remove. @@ -307,20 +367,23 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return syserror.ENAMETOOLONG } - var key device.MultiDeviceKey - removeSocket := false - if i.session().endpoints != nil { - // Find out if file being deleted is a socket that needs to be + var key *device.MultiDeviceKey + if i.session().overrides != nil { + // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { defer d.DecRef() - if fs.IsSocket(d.Inode.StableAttr) { - child := d.Inode.InodeOperations.(*inodeOperations) - key = child.fileState.key - removeSocket = true - // Stabilize the endpoint map while deletion is in progress. - unlock := i.session().endpoints.lock() + if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { + switch iops := d.Inode.InodeOperations.(type) { + case *inodeOperations: + key = &iops.fileState.key + case *fifo: + key = &iops.fileIops.fileState.key + } + + // Stabilize the override map while deletion is in progress. + unlock := i.session().overrides.lock() defer unlock() } } @@ -329,8 +392,8 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil { return err } - if removeSocket { - i.session().endpoints.remove(key) + if key != nil { + i.session().overrides.remove(*key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 498c4645a..f6b3ef178 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -33,60 +33,107 @@ import ( var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize // +stateify savable -type endpointMaps struct { - // mu protexts the direntMap, the keyMap, and the pathMap below. - mu sync.RWMutex `state:"nosave"` +type overrideInfo struct { + dirent *fs.Dirent + + // endpoint is set when dirent points to a socket. inode must not be set. + endpoint transport.BoundEndpoint + + // inode is set when dirent points to a pipe. endpoint must not be set. + inode *fs.Inode +} - // direntMap links sockets to their dirents. - // It is filled concurrently with the keyMap and is stored upon save. - // Before saving, this map is used to populate the pathMap. - direntMap map[transport.BoundEndpoint]*fs.Dirent +func (l *overrideInfo) inodeType() fs.InodeType { + switch { + case l.endpoint != nil: + return fs.Socket + case l.inode != nil: + return fs.Pipe + } + panic("endpoint or node must be set") +} - // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets. +// +stateify savable +type overrideMaps struct { + // mu protexts the keyMap, and the pathMap below. + mu sync.RWMutex `state:"nosave"` + + // keyMap links MultiDeviceKeys (containing inode IDs) to their sockets/pipes. // It is not stored during save because the inode ID may change upon restore. - keyMap map[device.MultiDeviceKey]transport.BoundEndpoint `state:"nosave"` + keyMap map[device.MultiDeviceKey]*overrideInfo `state:"nosave"` - // pathMap links the sockets to their paths. + // pathMap links the sockets/pipes to their paths. // It is filled before saving from the direntMap and is stored upon save. // Upon restore, this map is used to re-populate the keyMap. - pathMap map[transport.BoundEndpoint]string + pathMap map[*overrideInfo]string +} + +// addBoundEndpoint adds the bound endpoint to the map. +// A reference is taken on the dirent argument. +// +// Precondition: maps must have been locked with 'lock'. +func (e *overrideMaps) addBoundEndpoint(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) { + d.IncRef() + e.keyMap[key] = &overrideInfo{dirent: d, endpoint: ep} } -// add adds the endpoint to the maps. +// addPipe adds the pipe inode to the map. // A reference is taken on the dirent argument. // // Precondition: maps must have been locked with 'lock'. -func (e *endpointMaps) add(key device.MultiDeviceKey, d *fs.Dirent, ep transport.BoundEndpoint) { - e.keyMap[key] = ep +func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *fs.Inode) { d.IncRef() - e.direntMap[ep] = d + e.keyMap[key] = &overrideInfo{dirent: d, inode: inode} } // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *endpointMaps) remove(key device.MultiDeviceKey) { - endpoint := e.get(key) +func (e *overrideMaps) remove(key device.MultiDeviceKey) { + endpoint := e.keyMap[key] delete(e.keyMap, key) - - d := e.direntMap[endpoint] - d.DecRef() - delete(e.direntMap, endpoint) + endpoint.dirent.DecRef() } // lock blocks other addition and removal operations from happening while // the backing file is being created or deleted. Returns a function that unlocks // the endpoint map. -func (e *endpointMaps) lock() func() { +func (e *overrideMaps) lock() func() { e.mu.Lock() return func() { e.mu.Unlock() } } -// get returns the endpoint mapped to the given key. +// getBoundEndpoint returns the bound endpoint mapped to the given key. // -// Precondition: maps must have been locked for reading. -func (e *endpointMaps) get(key device.MultiDeviceKey) transport.BoundEndpoint { - return e.keyMap[key] +// Precondition: maps must have been locked. +func (e *overrideMaps) getBoundEndpoint(key device.MultiDeviceKey) transport.BoundEndpoint { + if v := e.keyMap[key]; v != nil { + return v.endpoint + } + return nil +} + +// getPipe returns the pipe inode mapped to the given key. +// +// Precondition: maps must have been locked. +func (e *overrideMaps) getPipe(key device.MultiDeviceKey) *fs.Inode { + if v := e.keyMap[key]; v != nil { + return v.inode + } + return nil +} + +// getType returns the inode type if there is a corresponding endpoint for the +// given key. Returns false otherwise. +func (e *overrideMaps) getType(key device.MultiDeviceKey) (fs.InodeType, bool) { + e.mu.Lock() + v := e.keyMap[key] + e.mu.Unlock() + + if v != nil { + return v.inodeType(), true + } + return 0, false } // session holds state for each 9p session established during sys_mount. @@ -137,16 +184,16 @@ type session struct { // mounter is the EUID/EGID that mounted this file system. mounter fs.FileOwner `state:"wait"` - // endpoints is used to map inodes that represent socket files to their - // corresponding endpoint. Socket files are created as regular files in the - // gofer and their presence in this map indicate that they should indeed be - // socket files. This allows unix domain sockets to be used with paths that - // belong to a gofer. + // overrides is used to map inodes that represent socket/pipes files to their + // corresponding endpoint/iops. These files are created as regular files in + // the gofer and their presence in this map indicate that they should indeed + // be socket/pipe files. This allows unix domain sockets and named pipes to + // be used with paths that belong to a gofer. // // TODO(gvisor.dev/issue/1200): there are few possible races with someone // stat'ing the file and another deleting it concurrently, where the file // will not be reported as socket file. - endpoints *endpointMaps `state:"wait"` + overrides *overrideMaps `state:"wait"` } // Destroy tears down the session. @@ -179,15 +226,21 @@ func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, // because overlay copyUp may have changed them out from under us. // So much for "immutable". - sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr - s.inodeMappings[sattr.InodeID] = path + switch iops := inode.InodeOperations.(type) { + case *inodeOperations: + s.inodeMappings[iops.fileState.sattr.InodeID] = path + case *fifo: + s.inodeMappings[iops.fileIops.fileState.sattr.InodeID] = path + default: + panic(fmt.Sprintf("Invalid type: %T", iops)) + } } -// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes -// (p9.QID, p9.AttrMask, p9.Attr). +// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File +// and attributes (p9.QID, p9.AttrMask, p9.Attr). // // Endpoints lock must not be held if socket == false. -func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr, socket bool) (fs.StableAttr, *inodeOperations) { +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { deviceKey := device.MultiDeviceKey{ Device: attr.RDev, SecondaryDevice: s.connID, @@ -201,17 +254,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p BlockSize: bsize(attr), } - if s.endpoints != nil { - if socket { - sattr.Type = fs.Socket - } else { - // If unix sockets are allowed on this filesystem, check if this file is - // supposed to be a socket file. - unlock := s.endpoints.lock() - if s.endpoints.get(deviceKey) != nil { - sattr.Type = fs.Socket - } - unlock() + if s.overrides != nil && sattr.Type == fs.RegularFile { + // If overrides are allowed on this filesystem, check if this file is + // supposed to be of a different type, e.g. socket. + if t, ok := s.overrides.getType(deviceKey); ok { + sattr.Type = t } } @@ -267,7 +314,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.EnableLeakCheck("gofer.session") if o.privateunixsocket { - s.endpoints = newEndpointMaps() + s.overrides = newOverrideMaps() } // Construct the MountSource with the session and superBlockFlags. @@ -305,26 +352,24 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF return nil, err } - sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr, false) + sattr, iops := newInodeOperations(ctx, &s, s.attach, qid, valid, attr) return fs.NewInode(ctx, iops, m, sattr), nil } -// newEndpointMaps creates a new endpointMaps. -func newEndpointMaps() *endpointMaps { - return &endpointMaps{ - direntMap: make(map[transport.BoundEndpoint]*fs.Dirent), - keyMap: make(map[device.MultiDeviceKey]transport.BoundEndpoint), - pathMap: make(map[transport.BoundEndpoint]string), +// newOverrideMaps creates a new overrideMaps. +func newOverrideMaps() *overrideMaps { + return &overrideMaps{ + keyMap: make(map[device.MultiDeviceKey]*overrideInfo), + pathMap: make(map[*overrideInfo]string), } } -// fillKeyMap populates key and dirent maps upon restore from saved -// pathmap. +// fillKeyMap populates key and dirent maps upon restore from saved pathmap. func (s *session) fillKeyMap(ctx context.Context) error { - unlock := s.endpoints.lock() + unlock := s.overrides.lock() defer unlock() - for ep, dirPath := range s.endpoints.pathMap { + for ep, dirPath := range s.overrides.pathMap { _, file, err := s.attach.walk(ctx, splitAbsolutePath(dirPath)) if err != nil { return fmt.Errorf("error filling endpointmaps, failed to walk to %q: %v", dirPath, err) @@ -341,25 +386,25 @@ func (s *session) fillKeyMap(ctx context.Context) error { Inode: qid.Path, } - s.endpoints.keyMap[key] = ep + s.overrides.keyMap[key] = ep } return nil } -// fillPathMap populates paths for endpoints from dirents in direntMap +// fillPathMap populates paths for overrides from dirents in direntMap // before save. func (s *session) fillPathMap() error { - unlock := s.endpoints.lock() + unlock := s.overrides.lock() defer unlock() - for ep, dir := range s.endpoints.direntMap { - mountRoot := dir.MountRoot() + for _, endpoint := range s.overrides.keyMap { + mountRoot := endpoint.dirent.MountRoot() defer mountRoot.DecRef() - dirPath, _ := dir.FullName(mountRoot) + dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") } - s.endpoints.pathMap[ep] = dirPath + s.overrides.pathMap[endpoint] = dirPath } return nil } @@ -368,7 +413,7 @@ func (s *session) fillPathMap() error { func (s *session) restoreEndpointMaps(ctx context.Context) error { // When restoring, only need to create the keyMap because the dirent and path // maps got stored through the save. - s.endpoints.keyMap = make(map[device.MultiDeviceKey]transport.BoundEndpoint) + s.overrides.keyMap = make(map[device.MultiDeviceKey]*overrideInfo) if err := s.fillKeyMap(ctx); err != nil { return fmt.Errorf("failed to insert sockets into endpoint map: %v", err) } @@ -376,6 +421,6 @@ func (s *session) restoreEndpointMaps(ctx context.Context) error { // Re-create pathMap because it can no longer be trusted as socket paths can // change while process continues to run. Empty pathMap will be re-filled upon // next save. - s.endpoints.pathMap = make(map[transport.BoundEndpoint]string) + s.overrides.pathMap = make(map[*overrideInfo]string) return nil } diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 0285c5361..111da59f9 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -25,9 +25,9 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { - if s.endpoints != nil { + if s.overrides != nil { if err := s.fillPathMap(); err != nil { - panic("failed to save paths to endpoint map before saving" + err.Error()) + panic("failed to save paths to override map before saving" + err.Error()) } } } @@ -74,10 +74,10 @@ func (s *session) afterLoad() { panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) } - // Check if endpointMaps exist when uds sockets are enabled - // (only pathmap will actualy have been saved). - if opts.privateunixsocket != (s.endpoints != nil) { - panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) + // Check if overrideMaps exist when uds sockets are enabled (only pathmaps + // will actually have been saved). + if opts.privateunixsocket != (s.overrides != nil) { + panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.overrides != nil)) } if args.Flags != s.superBlockFlags { panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags)) diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 376cfce2c..10ba2f5f0 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -32,15 +32,15 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return nil } - if i.session().endpoints != nil { - unlock := i.session().endpoints.lock() + if i.session().overrides != nil { + unlock := i.session().overrides.lock() defer unlock() - ep := i.session().endpoints.get(i.fileState.key) + ep := i.session().overrides.getBoundEndpoint(i.fileState.key) if ep != nil { return ep } - // Not found in endpoints map, it may be a gofer backed unix socket... + // Not found in overrides map, it may be a gofer backed unix socket... } inode.IncRef() diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index e37e687c6..7c60dc1db 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -152,9 +152,9 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { Usage: s.Blocks * 512, Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), Owner: owner(mo, s), - AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), - ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), - StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + AccessTime: time.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), Links: uint64(s.Nlink), } } @@ -165,6 +165,8 @@ type dirInfo struct { bufp int // location of next record in buf. } +// LINT.IfChange + // isBlockError unwraps os errors and checks if they are caused by EAGAIN or // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. func isBlockError(err error) bool { @@ -177,6 +179,8 @@ func isBlockError(err error) bool { return false } +// LINT.ThenChange(../../fsimpl/host/util.go) + func hostEffectiveKIDs() (uint32, []uint32, error) { gids, err := os.Getgroups() if err != nil { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index b66c091ab..55fb71c16 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -278,11 +278,19 @@ func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, fla } // ListXattr calls i.InodeOperations.ListXattr with i as the Inode. -func (i *Inode) ListXattr(ctx context.Context) (map[string]struct{}, error) { +func (i *Inode) ListXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { if i.overlay != nil { - return overlayListXattr(ctx, i.overlay) + return overlayListXattr(ctx, i.overlay, size) } - return i.InodeOperations.ListXattr(ctx, i) + return i.InodeOperations.ListXattr(ctx, i, size) +} + +// RemoveXattr calls i.InodeOperations.RemoveXattr with i as the Inode. +func (i *Inode) RemoveXattr(ctx context.Context, d *Dirent, name string) error { + if i.overlay != nil { + return overlayRemoveXattr(ctx, i.overlay, d, name) + } + return i.InodeOperations.RemoveXattr(ctx, i, name) } // CheckPermission will check if the caller may access this file in the diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 70f2eae96..2bbfb72ef 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -190,7 +190,18 @@ type InodeOperations interface { // ListXattr returns the set of all extended attributes names that // have values. Inodes that do not support extended attributes return // EOPNOTSUPP. - ListXattr(ctx context.Context, inode *Inode) (map[string]struct{}, error) + // + // If this is called through the listxattr(2) syscall, size indicates the + // size of the buffer that the application has allocated to hold the + // attribute list. If the list would be larger than size, implementations may + // return ERANGE to indicate that the buffer is too small, but they are also + // free to ignore the hint entirely. All size checking is done independently + // at the syscall layer. + ListXattr(ctx context.Context, inode *Inode, size uint64) (map[string]struct{}, error) + + // RemoveXattr removes an extended attribute specified by name. Inodes that + // do not support extended attributes return EOPNOTSUPP. + RemoveXattr(ctx context.Context, inode *Inode, name string) error // Check determines whether an Inode can be accessed with the // requested permission mask using the context (which gives access diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 4729b4aac..5ada33a32 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -564,15 +564,15 @@ func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, valu return o.upper.SetXattr(ctx, d, name, value, flags) } -func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{}, error) { +func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[string]struct{}, error) { o.copyMu.RLock() defer o.copyMu.RUnlock() var names map[string]struct{} var err error if o.upper != nil { - names, err = o.upper.ListXattr(ctx) + names, err = o.upper.ListXattr(ctx, size) } else { - names, err = o.lower.ListXattr(ctx) + names, err = o.lower.ListXattr(ctx, size) } for name := range names { // Same as overlayGetXattr, we shouldn't forward along @@ -584,6 +584,18 @@ func overlayListXattr(ctx context.Context, o *overlayEntry) (map[string]struct{} return names, err } +func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { + // Don't allow changes to overlay xattrs through a removexattr syscall. + if strings.HasPrefix(XattrOverlayPrefix, name) { + return syserror.EPERM + } + + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.RemoveXattr(ctx, d, name) +} + func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { o.copyMu.RLock() // Hot path. Avoid defers. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index 928c90aa0..e3a715c1f 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -143,7 +143,10 @@ func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ i } var writeLen int64 - for event := i.events.Front(); event != nil; event = event.Next() { + for it := i.events.Front(); it != nil; { + event := it + it = it.Next() + // Does the buffer have enough remaining space to hold the event we're // about to write out? if dst.NumBytes() < int64(event.sizeOf()) { diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index e672a438c..a3d10770b 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -36,11 +36,12 @@ func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { - groot := g.Root() - name, _ := groot.FullName(root) - groot.DecRef() - gotStr[i] = name - gotPaths[name] = struct{}{} + if groot := g.Root(); groot != nil { + name, _ := groot.FullName(root) + groot.DecRef() + gotStr[i] = name + gotPaths[name] = struct{}{} + } } if len(got) != len(want) { return fmt.Errorf("mount paths are different, got: %q, want: %q", gotStr, want) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 574a2cc91..c7981f66e 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -100,10 +100,14 @@ func newUndoMount(d *Dirent) *Mount { } } -// Root returns the root dirent of this mount. Callers must call DecRef on the -// returned dirent. +// Root returns the root dirent of this mount. +// +// This may return nil if the mount has already been free. Callers must handle this +// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent. func (m *Mount) Root() *Dirent { - m.root.IncRef() + if !m.root.TryIncRef() { + return nil + } return m.root } diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 280093c5e..77c2c5c0e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md index 5d4ec6c7b..6667a0916 100644 --- a/pkg/sentry/fs/proc/README.md +++ b/pkg/sentry/fs/proc/README.md @@ -11,6 +11,8 @@ inconsistency, please file a bug. The following files are implemented: +<!-- mdformat off(don't wrap the table) --> + | File /proc/ | Content | | :------------------------ | :---------------------------------------------------- | | [cpuinfo](#cpuinfo) | Info about the CPU | @@ -22,6 +24,8 @@ The following files are implemented: | [uptime](#uptime) | Wall clock since boot, combined idle time of all cpus | | [version](#version) | Kernel version | +<!-- mdformat on --> + ### cpuinfo ```bash diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index c10888100..94deb553b 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -60,13 +60,15 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { }) for _, m := range ms { mroot := m.Root() + if mroot == nil { + continue // No longer valid. + } mountPath, desc := mroot.FullName(rootDir) mroot.DecRef() if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue } - fn(mountPath, m) } } @@ -91,6 +93,12 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se var buf bytes.Buffer forEachMount(mif.t, func(mountPath string, m *fs.Mount) { + mroot := m.Root() + if mroot == nil { + return // No longer valid. + } + defer mroot.DecRef() + // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) @@ -107,9 +115,6 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se // (3) Major:Minor device ID. We don't have a superblock, so we // just use the root inode device number. - mroot := m.Root() - defer mroot.DecRef() - sa := mroot.Inode.StableAttr fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) @@ -207,6 +212,9 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan // // The "needs dump"and fsck flags are always 0, which is allowed. root := m.Root() + if root == nil { + return // No longer valid. + } defer root.DecRef() flags := root.Inode.MountSource.Flags diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 6f2775344..95d5817ff 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -43,7 +43,10 @@ import ( // newNet creates a new proc net entry. func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0772d4ae4..d4c4b533d 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), "core": p.newSysNetCore(ctx, msrc, s), diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ca020e11e..4e9b0fc00 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -71,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - "io": newIO(t, msrc, isThreadGroup), - "maps": newMaps(t, msrc), - "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "ns": newNamespaceDir(t, msrc), - "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), - "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, p.pidns), - "uid_map": newUIDMap(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "ns": newNamespaceDir(t, msrc), + "oom_score": newOOMScore(t, msrc), + "oom_score_adj": newOOMScoreAdj(t, msrc), + "smaps": newSmaps(t, msrc), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), + "statm": newStatm(t, msrc), + "status": newStatus(t, msrc, p.pidns), + "uid_map": newUIDMap(t, msrc), } if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) @@ -249,7 +252,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(t, exeSymlink, msrc, fs.Symlink, t) } -func (e *exe) executable() (d *fs.Dirent, err error) { +func (e *exe) executable() (file fsbridge.File, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { @@ -262,8 +265,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). - d = mm.Executable() - if d == nil { + file = mm.Executable() + if file == nil { err = syserror.ENOENT } }) @@ -283,15 +286,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } defer exec.DecRef() - root := fs.RootFromContext(ctx) - if root == nil { - // This doesn't correspond to anything in Linux because the vfs is - // global there. - return "", syserror.EINVAL - } - defer root.DecRef() - n, _ := exec.FullName(root) - return n, nil + return exec.PathnameWithDeleted(ctx), nil } // namespaceSymlink represents a symlink in the namespacefs, such as the files @@ -803,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc return int64(n), err } +// newOOMScore returns a oom_score file. It is a stub that always returns 0. +// TODO(gvisor.dev/issue/1967) +func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newStaticProcInode(t, msrc, []byte("0\n")) +} + +// oomScoreAdj is a file containing the oom_score adjustment for a task. +// +// +stateify savable +type oomScoreAdj struct { + fsutil.SimpleFileInode + + t *kernel.Task +} + +// +stateify savable +type oomScoreAdjFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + t *kernel.Task +} + +// newOOMScoreAdj returns a oom_score_adj file. +func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + i := &oomScoreAdj{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(t, i, msrc, fs.SpecialFile, t) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil +} + +// Read implements fs.FileOperations.Read. +func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + adj, err := f.t.OOMScoreAdj() + if err != nil { + return 0, err + } + adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n") + n, err := dst.CopyOut(ctx, adjBytes) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := f.t.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} + // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index dabc10662..25abbc151 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -17,6 +17,7 @@ package tmpfs import ( "fmt" "io" + "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -444,10 +445,15 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) defer rw.f.dataMu.Unlock() // Compute the range to write. - end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) - if end == rw.offset { // srcs.NumBytes() == 0? + if srcs.NumBytes() == 0 { + // Nothing to do. return 0, nil } + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == math.MaxInt64 { + // Overflow. + return 0, syserror.EINVAL + } // Check if seals prevent either file growth or all writes. switch { diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index c00cef0a5..3c2b583ae 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -159,8 +159,13 @@ func (d *Dir) SetXattr(ctx context.Context, i *fs.Inode, name, value string, fla } // ListXattr implements fs.InodeOperations.ListXattr. -func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode) (map[string]struct{}, error) { - return d.ramfsDir.ListXattr(ctx, i) +func (d *Dir) ListXattr(ctx context.Context, i *fs.Inode, size uint64) (map[string]struct{}, error) { + return d.ramfsDir.ListXattr(ctx, i, size) +} + +// RemoveXattr implements fs.InodeOperations.RemoveXattr. +func (d *Dir) RemoveXattr(ctx context.Context, i *fs.Inode, name string) error { + return d.ramfsDir.RemoveXattr(ctx, i, name) } // Lookup implements fs.InodeOperations.Lookup. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index db55cdc48..6a2dbc576 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -73,7 +73,7 @@ func (si *slaveInodeOperations) Release(ctx context.Context) { } // Truncate implements fs.InodeOperations.Truncate. -func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { +func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { return nil } diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD new file mode 100644 index 000000000..6c798f0bd --- /dev/null +++ b/pkg/sentry/fsbridge/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fsbridge", + srcs = [ + "bridge.go", + "fs.go", + "vfs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go new file mode 100644 index 000000000..8e7590721 --- /dev/null +++ b/pkg/sentry/fsbridge/bridge.go @@ -0,0 +1,54 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2 +// files. +package fsbridge + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// File provides a common interface to bridge between VFS1 and VFS2 files. +type File interface { + // PathnameWithDeleted returns an absolute pathname to vd, consistent with + // Linux's d_path(). In particular, if vd.Dentry() has been disowned, + // PathnameWithDeleted appends " (deleted)" to the returned pathname. + PathnameWithDeleted(ctx context.Context) string + + // ReadFull read all contents from the file. + ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) + + // ConfigureMMap mutates opts to implement mmap(2) for the file. + ConfigureMMap(context.Context, *memmap.MMapOpts) error + + // Type returns the file type, e.g. linux.S_IFREG. + Type(context.Context) (linux.FileMode, error) + + // IncRef increments reference. + IncRef() + + // DecRef decrements reference. + DecRef() +} + +// Lookup provides a common interface to open files. +type Lookup interface { + // OpenPath opens a file. + OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) +} diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go new file mode 100644 index 000000000..093ce1fb3 --- /dev/null +++ b/pkg/sentry/fsbridge/fs.go @@ -0,0 +1,181 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fsFile implements File interface over fs.File. +// +// +stateify savable +type fsFile struct { + file *fs.File +} + +var _ File = (*fsFile)(nil) + +// NewFSFile creates a new File over fs.File. +func NewFSFile(file *fs.File) File { + return &fsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { + root := fs.RootFromContext(ctx) + if root == nil { + // This doesn't correspond to anything in Linux because the vfs is + // global there. + return "" + } + defer root.DecRef() + + name, _ := f.file.Dirent.FullName(root) + return name +} + +// ReadFull implements File. +func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.Preadv(ctx, dst, offset+total) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. +func (f *fsFile) Type(context.Context) (linux.FileMode, error) { + return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil +} + +// IncRef implements File. +func (f *fsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *fsFile) DecRef() { + f.file.DecRef() +} + +// fsLookup implements Lookup interface using fs.File. +// +// +stateify savable +type fsLookup struct { + mntns *fs.MountNamespace + + root *fs.Dirent + workingDir *fs.Dirent +} + +var _ Lookup = (*fsLookup)(nil) + +// NewFSLookup creates a new Lookup using VFS1. +func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup { + return &fsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) { + var d *fs.Dirent + var err error + if resolveFinal { + d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals) + } else { + d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals) + } + if err != nil { + return nil, err + } + defer d.DecRef() + + if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { + return nil, syserror.ELOOP + } + + fsPerm := openOptionsToPermMask(&opts) + if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil { + return nil, err + } + + // If they claim it's a directory, then make sure. + if strings.HasSuffix(path, "/") { + if d.Inode.StableAttr.Type != fs.Directory { + return nil, syserror.ENOTDIR + } + } + + if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile { + ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type) + return nil, syserror.EACCES + } + + f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags)) + if err != nil { + return nil, err + } + + return &fsFile{file: f}, nil +} + +func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask { + mode := opts.Flags & linux.O_ACCMODE + return fs.PermMask{ + Read: mode == linux.O_RDONLY || mode == linux.O_RDWR, + Write: mode == linux.O_WRONLY || mode == linux.O_RDWR, + Execute: opts.FileExec, + } +} + +func flagsToFileFlags(flags uint32) fs.FileFlags { + return fs.FileFlags{ + Direct: flags&linux.O_DIRECT != 0, + DSync: flags&(linux.O_DSYNC|linux.O_SYNC) != 0, + Sync: flags&linux.O_SYNC != 0, + NonBlocking: flags&linux.O_NONBLOCK != 0, + Read: (flags & linux.O_ACCMODE) != linux.O_WRONLY, + Write: (flags & linux.O_ACCMODE) != linux.O_RDONLY, + Append: flags&linux.O_APPEND != 0, + Directory: flags&linux.O_DIRECTORY != 0, + Async: flags&linux.O_ASYNC != 0, + LargeFile: flags&linux.O_LARGEFILE != 0, + Truncate: flags&linux.O_TRUNC != 0, + } +} diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go new file mode 100644 index 000000000..6aa17bfc1 --- /dev/null +++ b/pkg/sentry/fsbridge/vfs.go @@ -0,0 +1,138 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fsFile implements File interface over vfs.FileDescription. +// +// +stateify savable +type vfsFile struct { + file *vfs.FileDescription +} + +var _ File = (*vfsFile)(nil) + +// NewVFSFile creates a new File over fs.File. +func NewVFSFile(file *vfs.FileDescription) File { + return &vfsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + + vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) + return name +} + +// ReadFull implements File. +func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{}) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. +func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) { + stat, err := f.file.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + return linux.FileMode(stat.Mode).FileType(), nil +} + +// IncRef implements File. +func (f *vfsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *vfsFile) DecRef() { + f.file.DecRef() +} + +// fsLookup implements Lookup interface using fs.File. +// +// +stateify savable +type vfsLookup struct { + mntns *vfs.MountNamespace + + root vfs.VirtualDentry + workingDir vfs.VirtualDentry +} + +var _ Lookup = (*vfsLookup)(nil) + +// NewVFSLookup creates a new Lookup using VFS2. +func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup { + return &vfsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +// +// remainingTraversals is not configurable in VFS2, all callers are using the +// default anyways. +// +// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission. +func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { + vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() + creds := auth.CredentialsFromContext(ctx) + path := fspath.Parse(pathname) + pop := &vfs.PathOperation{ + Root: l.root, + Start: l.workingDir, + Path: path, + FollowFinalSymlink: resolveFinal, + } + if path.Absolute { + pop.Start = l.root + } + fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) + if err != nil { + return nil, err + } + return &vfsFile{file: fd}, nil +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d36fa74fb..abd4f24e7 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) +// Name is the default filesystem name. +const Name = "devtmpfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct { initOnce sync.Once @@ -86,7 +89,7 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth // Release must be called when a is no longer in use. func (a *Accessor) Release() { a.root.DecRef() - a.mntns.DecRef(a.vfsObj) + a.mntns.DecRef() } // accessorContext implements context.Context by extending an existing @@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { func (ac *accessorContext) Value(key interface{}) interface{} { switch key { case vfs.CtxMountNamespace: + ac.a.mntns.IncRef() return ac.a.mntns case vfs.CtxRoot: ac.a.root.IncRef() diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 82c58c900..b6d52c015 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } // Register tmpfs just so that we can have a root filesystem that isn't // devtmpfs. vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -45,7 +48,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() root := mntns.Root() defer root.DecRef() devpop := vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index d1436b943..89caee3df 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -15,6 +15,9 @@ // These benchmarks emulate memfs benchmarks. Ext4 images must be created // before this benchmark is run using the `make_deep_ext4.sh` script at // /tmp/image-{depth}.ext4 for all the depths tested below. +// +// The benchmark itself cannot run the script because the script requires +// sudo privileges to create the file system images. package benchmark_test import ( @@ -49,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, nil, nil, nil, err + } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index ebb72b75e..bd6ede995 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -188,14 +188,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) } - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: child.diskDirent.FileName(), Type: fs.ToDirentType(childType), Ino: uint64(child.diskDirent.Inode()), NextOff: fd.off + 1, - }) { + }); err != nil { dir.childList.InsertBefore(child, fd.iter) - return nil + return err } fd.off++ } diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 05f992826..29bb73765 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -496,9 +499,9 @@ func newIterDirentCb() *iterDirentsCb { } // Handle implements vfs.IterDirentsCallback.Handle. -func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool { +func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error { cb.dirents = append(cb.dirents, dirent) - return true + return nil } // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 07bf58953..e05429d41 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { return nil, syserror.EROFS } - return inode.open(rp, vfsd, opts.Flags) + return inode.open(rp, vfsd, &opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 191b39970..6962083f5 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } // open creates and returns a file description for the dentry passed in. -func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } @@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - if flags&linux.O_PATH == 0 { + if opts.Flags&linux.O_PATH == 0 { // Can't open symlinks without O_PATH. return nil, syserror.ELOOP } var fd symlinkFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD new file mode 100644 index 000000000..4ba76a1e8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -0,0 +1,55 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "gofer", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_library( + name = "gofer", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "gofer.go", + "handle.go", + "handle_unsafe.go", + "p9file.go", + "pagemath.go", + "regular_file.go", + "special_file.go", + "symlink.go", + "time.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fd", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/safemem", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/unet", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go new file mode 100644 index 000000000..5dbfc6250 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -0,0 +1,194 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return d.fileType() == linux.S_IFDIR +} + +// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != +// InteropModeShared. +func (d *dentry) cacheNegativeChildLocked(name string) { + if d.negativeChildren == nil { + d.negativeChildren = make(map[string]struct{}) + } + d.negativeChildren[name] = struct{}{} +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + if fd.dirents == nil { + ds, err := fd.dentry().getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). There exists at least one directoryFD representing d. +func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + // 9P2000.L's readdir does not specify behavior in the presence of + // concurrent mutation of an iterated directory, so implementations may + // duplicate or omit entries in this case, which violates POSIX semantics. + // Thus we read all directory entries while holding d.dirMu to exclude + // directory mutations. (Note that it is impossible for the client to + // exclude concurrent mutation from other remote filesystem users. Since + // there is no way to detect if the server has incorrectly omitted + // directory entries, we simply assume that the server is well-behaved + // under InteropModeShared.) This is inconsistent with Linux (which appears + // to assume that directory fids have the correct semantics, and translates + // struct file_operations::readdir calls directly to readdir RPCs), but is + // consistent with VFS1. + // + // NOTE(b/135560623): In particular, some gofer implementations may not + // retain state between calls to Readdir, so may not provide a coherent + // directory stream across in the presence of mutation. + + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + if d.dirents != nil { + return d.dirents, nil + } + + // It's not clear if 9P2000.L's readdir is expected to return "." and "..", + // so we generate them here. + parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + defer d.handleMu.RUnlock() + if !d.handleReadable { + // This should not be possible because a readable handle should have + // been opened when the calling directoryFD was opened. + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + return nil, err + } + if len(p9ds) == 0 { + // Cache dirents for future directoryFDs if permitted. + if d.fs.opts.interop != InteropModeShared { + d.dirents = dirents + } + return dirents, nil + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + } + off = p9ds[len(p9ds)-1].Offset + } +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go new file mode 100644 index 000000000..5cfb0dc4c --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -0,0 +1,1090 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // Snapshot current dentries and special files. + fs.syncMu.Lock() + ds := make([]*dentry, 0, len(fs.dentries)) + for d := range fs.dentries { + ds = append(ds, d) + } + sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) + for sffd := range fs.specialFileFDs { + sffds = append(sffds, sffd) + } + fs.syncMu.Unlock() + + // Return the first error we encounter, but sync everything we can + // regardless. + var retErr error + + // Sync regular files. + for _, d := range ds { + if !d.TryIncRef() { + continue + } + err := d.syncSharedHandle(ctx) + d.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). + for _, sffd := range sffds { + if !sffd.vfsfd.TryIncRef() { + continue + } + err := sffd.Sync(ctx) + sffd.vfsfd.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + return retErr +} + +// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's +// encoding of strings, which uses 2 bytes for the length prefix. +const maxFilenameLen = (1 << 16) - 1 + +// dentrySlicePool is a pool of *[]*dentry used to store dentries for which +// dentry.checkCachingLocked() must be called. The pool holds pointers to +// slices because Go lacks generics, so sync.Pool operates on interface{}, so +// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy +// of the slice header on the heap. +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may become cached as a result of the traversal are appended +// to *ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached +// metadata must be up to date. +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + parentVFSD, err := rp.ResolveParent(&d.vfsd) + if err != nil { + return nil, err + } + parent := parentVFSD.Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // We must assume that parentVFSD is correct, because if d has been + // moved elsewhere in the remote filesystem so that its parent has + // changed, we have no way of determining its new parent's location + // in the filesystem. Get updated metadata for parentVFSD. + _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + if err != nil { + return nil, err + } + parent.updateFromP9Attrs(attrMask, &attr) + } + rp.Advance() + return parent, nil + } + childVFSD, err := rp.ResolveChild(&d.vfsd, name) + if err != nil { + return nil, err + } + // FIXME(jamieliu): Linux performs revalidation before mount lookup + // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), + // __follow_mount_rcu()). + child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + if err != nil { + return nil, err + } + if child == nil { + return nil, syserror.ENOENT + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. If no file +// exists at name, revalidateChildLocked returns (nil, nil). +// +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// parent.isDir(). name is not "." or "..". +// +// Postconditions: If revalidateChildLocked returns a non-nil dentry, its +// cached metadata is up to date. +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { + if childVFSD != nil && fs.opts.interop != InteropModeShared { + // We have a cached dentry that is assumed to be correct. + return childVFSD.Impl().(*dentry), nil + } + // We either don't have a cached dentry or need to verify that it's still + // correct, either of which requires a remote lookup. Check if this name is + // valid before performing the lookup. + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + // Check if we've already cached this lookup with a negative result. + if _, ok := parent.negativeChildren[name]; ok { + return nil, nil + } + // Perform the remote lookup. + qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) + if err != nil && err != syserror.ENOENT { + return nil, err + } + if childVFSD != nil { + child := childVFSD.Impl().(*dentry) + if !file.isNil() && qid.Path == child.ino { + // The file at this path hasn't changed. Just update cached + // metadata. + file.close(ctx) + child.updateFromP9Attrs(attrMask, &attr) + return child, nil + } + // The file at this path has changed or no longer exists. Remove + // the stale dentry from the tree, and re-evaluate its caching + // status (i.e. if it has 0 references, drop it). + vfsObj.ForceDeleteDentry(childVFSD) + *ds = appendDentry(*ds, child) + childVFSD = nil + } + if file.isNil() { + // No file exists at this path now. Cache the negative lookup if + // allowed. + if fs.opts.interop != InteropModeShared { + parent.cacheNegativeChildLocked(name) + } + return nil, nil + } + // Create a new dentry representing the file. + child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + if err != nil { + file.close(ctx) + return nil, err + } + parent.IncRef() // reference held by child on its parent + parent.vfsd.InsertChild(&child.vfsd, name) + // For now, child has 0 references, so our caller should call + // child.checkCachingLocked(). + *ds = appendDentry(*ds, child) + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == +// InteropModeShared, then d's cached metadata must be up to date. +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for rp.Start() as required by fs.stepLocked(). + if err := d.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if parent.isDeleted() { + return syserror.ENOENT + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + if fs.opts.interop == InteropModeShared { + // The existence of a dentry at name would be inconclusive because the + // file it represents may have been deleted from the remote filesystem, + // so we would need to make an RPC to revalidate the dentry. Just + // attempt the file creation RPC instead. If a file does exist, the RPC + // will fail with EEXIST like we would have. If the RPC succeeds, and a + // stale dentry exists, the dentry will fail revalidation next time + // it's used. + return create(parent, name) + } + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST + } + // No cached dentry exists; however, there might still be an existing file + // at name. As above, we attempt the file creation RPC anyway. + if err := create(parent, name); err != nil { + return err + } + parent.touchCMtime(ctx) + delete(parent.negativeChildren, name) + parent.dirents = nil + return nil +} + +// Preconditions: !rp.Done(). +func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + name := rp.Component() + if dir { + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + } else { + if name == "." || name == ".." { + return syserror.EISDIR + } + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + childVFSD := parent.vfsd.Child(name) + var child *dentry + // We only need a dentry representing the file at name if it can be a mount + // point. If childVFSD is nil, then it can't be a mount point. If childVFSD + // is non-nil but stale, the actual file can't be a mount point either; we + // detect this case by just speculatively calling PrepareDeleteDentry and + // only revalidating the dentry if that fails (indicating that the existing + // dentry is a mount point). + if childVFSD != nil { + child = childVFSD.Impl().(*dentry) + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if err != nil { + return err + } + if child != nil { + childVFSD = &child.vfsd + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + return err + } + } else { + childVFSD = nil + } + } + } else if _, ok := parent.negativeChildren[name]; ok { + return syserror.ENOENT + } + flags := uint32(0) + if dir { + if child != nil && !child.isDir() { + return syserror.ENOTDIR + } + flags = linux.AT_REMOVEDIR + } else { + if child != nil && child.isDir() { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + } + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if childVFSD != nil { + vfsObj.AbortDeleteDentry(childVFSD) + } + return err + } + if fs.opts.interop != InteropModeShared { + parent.touchCMtime(ctx) + parent.cacheNegativeChildLocked(name) + parent.dirents = nil + } + if child != nil { + child.setDeleted() + vfsObj.CommitDeleteDentry(childVFSD) + ds = appendDentry(ds, child) + } + return nil +} + +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + // 9P2000.L supports hard links, but we don't. + return syserror.EPERM + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Reject O_TMPFILE, which is not supported; supporting it correctly in the + // presence of other remote filesystem users requires remote filesystem + // support, and it isn't clear that there's any way to implement this in + // 9P. + if opts.Flags&linux.O_TMPFILE != 0 { + return nil, syserror.EOPNOTSUPP + } + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by fs.stepLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, &ds) + if err == syserror.ENOENT && mayCreate { + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + parent.dirMu.Unlock() + return fd, err + } + if err != nil { + parent.dirMu.Unlock() + return nil, err + } + // Open existing child or follow symlink. + parent.dirMu.Unlock() + if mustCreate { + return nil, syserror.EEXIST + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + return nil, err + } + mnt := rp.Mount() + filetype := d.fileType() + switch { + case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFDIR: + // Can't open directories with O_CREAT. + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } + fd := &directoryFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFLNK: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + default: + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) + if err != nil { + return nil, err + } + fd := &specialFileFD{ + handle: h, + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil + } +} + +// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if d.isDeleted() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + creds := rp.Credentials() + name := rp.Component() + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Sanity-check that we walked to the file we created. + if createQID.Path != walkQID.Path { + // Probably due to concurrent remote filesystem mutation? + ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, syserror.EAGAIN + } + + // Construct the new dentry. + child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Incorporate the fid that was opened by lcreate. + useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + if useRegularFileFD { + child.handleMu.Lock() + child.handle.file = openFile + if fdobj != nil { + child.handle.fd = int32(fdobj.Release()) + } + child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) + child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) + child.handleMu.Unlock() + } + // Take a reference on the new dentry to be held by the new file + // description. (This reference also means that the new dentry is not + // eligible for caching yet, so we don't need to append to a dentry slice.) + child.refs = 1 + // Insert the dentry into the tree. + d.IncRef() // reference held by child on its parent d + d.vfsd.InsertChild(&child.vfsd, name) + if d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + delete(d.negativeChildren, name) + d.dirents = nil + } + + // Finally, construct a file description representing the created file. + var childVFSFD *vfs.FileDescription + mnt.IncRef() + if useRegularFileFD { + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + childVFSFD = &fd.vfsfd + } else { + fd := &specialFileFD{ + handle: handle{ + file: openFile, + fd: -1, + }, + } + if fdobj != nil { + fd.handle.fd = int32(fdobj.Release()) + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.handle.close(ctx) + return nil, err + } + childVFSFD = &fd.vfsfd + } + return childVFSFD, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + if !d.isSymlink() { + return "", syserror.EINVAL + } + return d.readlink(ctx, rp.Mount()) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // Requires 9P support. + return syserror.EINVAL + } + + var ds *[]*dentry + fs.renameMu.Lock() + defer fs.renameMuUnlockAndCheckCaching(&ds) + newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + if err := oldParent.updateFromGetattr(ctx); err != nil { + return err + } + } + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + vfsObj := rp.VirtualFilesystem() + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + if err != nil { + return err + } + if renamed == nil { + return syserror.ENOENT + } + if renamed.isDir() { + if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.isDeleted() { + return syserror.ENOENT + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + // This is similar to unlinkAt, except: + // + // - We revalidate the replaced dentry unconditionally for simplicity. + // + // - If rp.MustBeDir(), then we need a dentry representing the replaced + // file regardless to confirm that it's a directory. + if replacedVFSD != nil || rp.MustBeDir() { + replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) + if err != nil { + return err + } + if replaced != nil { + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + replacedVFSD = &replaced.vfsd + } else { + replacedVFSD = nil + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + if fs.opts.interop != InteropModeShared { + oldParent.cacheNegativeChildLocked(oldName) + oldParent.dirents = nil + delete(newParent.negativeChildren, newName) + newParent.dirents = nil + } + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, true /* dir */) +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + // Since walking updates metadata for all traversed dentries under + // InteropModeShared, including the returned one, we can return cached + // metadata here regardless of fs.opts.interop. + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + fsstat, err := d.file.statFS(ctx) + if err != nil { + return linux.Statfs{}, err + } + nameLen := uint64(fsstat.NameLength) + if nameLen > maxFilenameLen { + nameLen = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: int64(fsstat.BlockSize), + Blocks: fsstat.Blocks, + BlocksFree: fsstat.BlocksFree, + BlocksAvailable: fsstat.BlocksAvailable, + Files: fsstat.Files, + FilesFree: fsstat.FilesFree, + NameLength: nameLen, + }, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, false /* dir */) +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + return d.listxattr(ctx) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return d.getxattr(ctx, name) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setxattr(ctx, &opts) +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.removexattr(ctx, name) +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go new file mode 100644 index 000000000..c4a8f0b38 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -0,0 +1,1150 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gofer provides a filesystem implementation that is backed by a 9p +// server, interchangably referred to as "gofers" throughout this package. +// +// Lock order: +// regularFileFD/directoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// filesystem.syncMu +// dentry.metadataMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.handleMu +// dentry.dataMu +// +// Locking dentry.dirMu in multiple dentries requires holding +// filesystem.renameMu for writing. +package gofer + +import ( + "fmt" + "strconv" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Name is the default filesystem name. +const Name = "9p" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // mfp is used to allocate memory that caches regular file contents. mfp is + // immutable. + mfp pgalloc.MemoryFileProvider + + // Immutable options. + opts filesystemOptions + + // client is the client used by this filesystem. client is immutable. + client *p9.Client + + // uid and gid are the effective KUID and KGID of the filesystem's creator, + // and are used as the owner and group for files that don't specify one. + // uid and gid are immutable. + uid auth.KUID + gid auth.KGID + + // renameMu serves two purposes: + // + // - It synchronizes path resolution with renaming initiated by this + // client. + // + // - It is held by path resolution to ensure that reachable dentries remain + // valid. A dentry is reachable by path resolution if it has a non-zero + // reference count (such that it is usable as vfs.ResolvingPath.Start() or + // is reachable from its children), or if it is a child dentry (such that + // it is reachable from its parent). + renameMu sync.RWMutex + + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by renameMu. + cachedDentries dentryList + cachedDentriesLen uint64 + + // dentries contains all dentries in this filesystem. specialFileFDs + // contains all open specialFileFDs. These fields are protected by syncMu. + syncMu sync.Mutex + dentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} +} + +type filesystemOptions struct { + // "Standard" 9P options. + fd int + aname string + interop InteropMode // derived from the "cache" mount option + msize uint32 + version string + + // maxCachedDentries is the maximum number of dentries with 0 references + // retained by the client. + maxCachedDentries uint64 + + // If forcePageCache is true, host FDs may not be used for application + // memory mappings even if available; instead, the client must perform its + // own caching of regular file pages. This is primarily useful for testing. + forcePageCache bool + + // If limitHostFDTranslation is true, apply maxFillRange() constraints to + // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This + // makes memory accounting behavior more consistent between cases where + // host FDs are / are not available, but may increase the frequency of + // sentry-handled page faults on files for which a host FD is available. + limitHostFDTranslation bool + + // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote + // filesystem may not be coherent with writable host FDs opened later, so + // mappings of the former must be replaced by mappings of the latter. This + // is usually only the case when the remote filesystem is an overlayfs + // mount on Linux < 4.19. + overlayfsStaleRead bool + + // If regularFilesUseSpecialFileFD is true, application FDs representing + // regular files will use distinct file handles for each FD, in the same + // way that application FDs representing "special files" such as sockets + // do. Note that this disables client caching and mmap for regular files. + regularFilesUseSpecialFileFD bool +} + +// InteropMode controls the client's interaction with other remote filesystem +// users. +type InteropMode uint32 + +const ( + // InteropModeExclusive is appropriate when the filesystem client is the + // only user of the remote filesystem. + // + // - The client may cache arbitrary filesystem state (file data, metadata, + // filesystem structure, etc.). + // + // - Client changes to filesystem state may be sent to the remote + // filesystem asynchronously, except when server permission checks are + // necessary. + // + // - File timestamps are based on client clocks. This ensures that users of + // the client observe timestamps that are coherent with their own clocks + // and consistent with Linux's semantics. However, since it is not always + // possible for clients to set arbitrary atimes and mtimes, and never + // possible for clients to set arbitrary ctimes, file timestamp changes are + // stored in the client only and never sent to the remote filesystem. + InteropModeExclusive InteropMode = iota + + // InteropModeWritethrough is appropriate when there are read-only users of + // the remote filesystem that expect to observe changes made by the + // filesystem client. + // + // - The client may cache arbitrary filesystem state. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on client clocks. As a corollary, access + // timestamp changes from other remote filesystem users will not be visible + // to the client. + InteropModeWritethrough + + // InteropModeShared is appropriate when there are users of the remote + // filesystem that may mutate its state other than the client. + // + // - The client must verify cached filesystem state before using it. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on server clocks. This is necessary to + // ensure that timestamp changes are synchronized between remote filesystem + // users. + // + // Note that the correctness of InteropModeShared depends on the server + // correctly implementing 9P fids (i.e. each fid immutably represents a + // single filesystem object), even in the presence of remote filesystem + // mutations from other users. If this is violated, the behavior of the + // client is undefined. + InteropModeShared +) + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") + return nil, nil, syserror.EINVAL + } + + mopts := vfs.GenericParseMountOptions(opts.Data) + var fsopts filesystemOptions + + // Check that the transport is "fd". + trans, ok := mopts["trans"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") + return nil, nil, syserror.EINVAL + } + delete(mopts, "trans") + if trans != "fd" { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) + return nil, nil, syserror.EINVAL + } + + // Check that read and write FDs are provided and identical. + rfdstr, ok := mopts["rfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>") + return nil, nil, syserror.EINVAL + } + delete(mopts, "rfdno") + rfd, err := strconv.Atoi(rfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) + return nil, nil, syserror.EINVAL + } + wfdstr, ok := mopts["wfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>") + return nil, nil, syserror.EINVAL + } + delete(mopts, "wfdno") + wfd, err := strconv.Atoi(wfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) + return nil, nil, syserror.EINVAL + } + if rfd != wfd { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) + return nil, nil, syserror.EINVAL + } + fsopts.fd = rfd + + // Get the attach name. + fsopts.aname = "/" + if aname, ok := mopts["aname"]; ok { + delete(mopts, "aname") + fsopts.aname = aname + } + + // Parse the cache policy. For historical reasons, this defaults to the + // least generally-applicable option, InteropModeExclusive. + fsopts.interop = InteropModeExclusive + if cache, ok := mopts["cache"]; ok { + delete(mopts, "cache") + switch cache { + case "fscache": + fsopts.interop = InteropModeExclusive + case "fscache_writethrough": + fsopts.interop = InteropModeWritethrough + case "none": + fsopts.regularFilesUseSpecialFileFD = true + fallthrough + case "remote_revalidating": + fsopts.interop = InteropModeShared + default: + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache) + return nil, nil, syserror.EINVAL + } + } + + // Parse the 9P message size. + fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M + if msizestr, ok := mopts["msize"]; ok { + delete(mopts, "msize") + msize, err := strconv.ParseUint(msizestr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr) + return nil, nil, syserror.EINVAL + } + fsopts.msize = uint32(msize) + } + + // Parse the 9P protocol version. + fsopts.version = p9.HighestVersionString() + if version, ok := mopts["version"]; ok { + delete(mopts, "version") + fsopts.version = version + } + + // Parse the dentry cache limit. + fsopts.maxCachedDentries = 1000 + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + fsopts.maxCachedDentries = maxCachedDentries + } + + // Handle simple flags. + if _, ok := mopts["force_page_cache"]; ok { + delete(mopts, "force_page_cache") + fsopts.forcePageCache = true + } + if _, ok := mopts["limit_host_fd_translation"]; ok { + delete(mopts, "limit_host_fd_translation") + fsopts.limitHostFDTranslation = true + } + if _, ok := mopts["overlayfs_stale_read"]; ok { + delete(mopts, "overlayfs_stale_read") + fsopts.overlayfsStaleRead = true + } + // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying + // "cache=none". + + // Check for unparsed options. + if len(mopts) != 0 { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Establish a connection with the server. + conn, err := unet.NewSocket(fsopts.fd) + if err != nil { + return nil, nil, err + } + + // Perform version negotiation with the server. + ctx.UninterruptibleSleepStart(false) + client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + conn.Close() + return nil, nil, err + } + // Ownership of conn has been transferred to client. + + // Perform attach to obtain the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := client.Attach(fsopts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + client.Close() + return nil, nil, err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } + + // Construct the filesystem object. + fs := &filesystem{ + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + dentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), + } + fs.vfsfs.Init(vfsObj, fs) + + // Construct the root dentry. + root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) + if err != nil { + attachFile.close(ctx) + fs.vfsfs.DecRef() + return nil, nil, err + } + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is deliberately leaked to prevent the root from + // being "cached" and subsequently evicted. Its resources will still be + // cleaned up by fs.Release(). + root.refs = 2 + + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + ctx := context.Background() + mf := fs.mfp.MemoryFile() + + fs.syncMu.Lock() + for d := range fs.dentries { + d.handleMu.Lock() + d.dataMu.Lock() + if d.handleWritable { + // Write dirty cached data to the remote file. + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) + } + // TODO(jamieliu): Do we need to flushf/fsync d? + } + // Discard cached pages. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Close the host fd if one exists. + if d.handle.fd >= 0 { + syscall.Close(int(d.handle.fd)) + d.handle.fd = -1 + } + d.handleMu.Unlock() + } + // There can't be any specialFileFDs still using fs, since each such + // FileDescription would hold a reference on a Mount holding a reference on + // fs. + fs.syncMu.Unlock() + + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. refs is accessed using atomic memory + // operations. + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // We don't support hard links, so each dentry maps 1:1 to an inode. + + // file is the unopened p9.File that backs this dentry. file is immutable. + file p9file + + // If deleted is non-zero, the file represented by this dentry has been + // deleted. deleted is accessed using atomic memory operations. + deleted uint32 + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // filesystem.renameMu. + cached bool + dentryEntry + + dirMu sync.Mutex + + // If this dentry represents a directory, and InteropModeShared is not in + // effect, negativeChildren is a set of child names in this directory that + // are known not to exist. negativeChildren is protected by dirMu. + negativeChildren map[string]struct{} + + // If this dentry represents a directory, InteropModeShared is not in + // effect, and dirents is not nil, it is a cache of all entries in the + // directory, in the order they were returned by the server. dirents is + // protected by dirMu. + dirents []vfs.Dirent + + // Cached metadata; protected by metadataMu and accessed using atomic + // memory operations unless otherwise specified. + metadataMu sync.Mutex + ino uint64 // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown + // Timestamps, all nsecs from the Unix epoch. + atime int64 + mtime int64 + ctime int64 + btime int64 + // File size, protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it). + size uint64 + + mapsMu sync.Mutex + + // If this dentry represents a regular file, mappings tracks mappings of + // the file into memmap.MappingSpaces. mappings is protected by mapsMu. + mappings memmap.MappingSet + + // If this dentry represents a regular file or directory: + // + // - handle is the I/O handle used by all regularFileFDs/directoryFDs + // representing this dentry. + // + // - handleReadable is true if handle is readable. + // + // - handleWritable is true if handle is writable. + // + // Invariants: + // + // - If handleReadable == handleWritable == false, then handle.file == nil + // (i.e. there is no open handle). Conversely, if handleReadable || + // handleWritable == true, then handle.file != nil (i.e. there is an open + // handle). + // + // - handleReadable and handleWritable cannot transition from true to false + // (i.e. handles may not be downgraded). + // + // These fields are protected by handleMu. + handleMu sync.RWMutex + handle handle + handleReadable bool + handleWritable bool + + dataMu sync.RWMutex + + // If this dentry represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this dentry represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // pf implements platform.File for mappings of handle.fd. + pf dentryPlatformFile + + // If this dentry represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string +} + +// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the +// gofer client. +func dentryAttrMask() p9.AttrMask { + return p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + ATime: true, + MTime: true, + CTime: true, + Size: true, + BTime: true, + } +} + +// newDentry creates a new dentry representing the given file. The dentry +// initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.checkCachingLocked() as appropriate. +func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { + if !mask.Mode { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, syserror.EIO + } + if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, syserror.EIO + } + + d := &dentry{ + fs: fs, + file: file, + ino: qid.Path, + mode: uint32(attr.Mode), + uid: uint32(fs.uid), + gid: uint32(fs.gid), + blockSize: usermem.PageSize, + handle: handle{ + fd: -1, + }, + } + d.pf.dentry = d + if mask.UID { + d.uid = uint32(attr.UID) + } + if mask.GID { + d.gid = uint32(attr.GID) + } + if mask.Size { + d.size = attr.Size + } + if attr.BlockSize != 0 { + d.blockSize = uint32(attr.BlockSize) + } + if mask.ATime { + d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds) + } + if mask.MTime { + d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds) + } + if mask.CTime { + d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds) + } + if mask.BTime { + d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) + } + d.vfsd.Init(d) + + fs.syncMu.Lock() + fs.dentries[d] = struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +// updateFromP9Attrs is called to update d's metadata after an update from the +// remote filesystem. +func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { + d.metadataMu.Lock() + if mask.Mode { + if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { + d.metadataMu.Unlock() + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + atomic.StoreUint32(&d.mode, uint32(attr.Mode)) + } + if mask.UID { + atomic.StoreUint32(&d.uid, uint32(attr.UID)) + } + if mask.GID { + atomic.StoreUint32(&d.gid, uint32(attr.GID)) + } + // There is no P9_GETATTR_* bit for I/O block size. + if attr.BlockSize != 0 { + atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) + } + if mask.ATime { + atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) + } + if mask.MTime { + atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) + } + if mask.CTime { + atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)) + } + if mask.BTime { + atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) + } + if mask.Size { + d.dataMu.Lock() + atomic.StoreUint64(&d.size, attr.Size) + d.dataMu.Unlock() + } + d.metadataMu.Unlock() +} + +func (d *dentry) updateFromGetattr(ctx context.Context) error { + // Use d.handle.file, which represents a 9P fid that has been opened, in + // preference to d.file, which represents a 9P fid that has not. This may + // be significantly more efficient in some implementations. + var ( + file p9file + handleMuRLocked bool + ) + d.handleMu.RLock() + if !d.handle.file.isNil() { + file = d.handle.file + handleMuRLocked = true + } else { + file = d.file + d.handleMu.RUnlock() + } + _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) + if handleMuRLocked { + d.handleMu.RUnlock() + } + if err != nil { + return err + } + d.updateFromP9Attrs(attrMask, &attr) + return nil +} + +func (d *dentry) fileType() uint32 { + return atomic.LoadUint32(&d.mode) & linux.S_IFMT +} + +func (d *dentry) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME + stat.Blksize = atomic.LoadUint32(&d.blockSize) + stat.Nlink = 1 + if d.isDir() { + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = d.ino + stat.Size = atomic.LoadUint64(&d.size) + // This is consistent with regularFileFD.Seek(), which treats regular files + // as having no holes. + stat.Blocks = (stat.Size + 511) / 512 + stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) + stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) + stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) + stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) + // TODO(jamieliu): device number +} + +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { + if stat.Mask == 0 { + return nil + } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + setLocalAtime := false + setLocalMtime := false + if d.fs.opts.interop != InteropModeShared { + // Timestamp updates will be handled locally. + setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 + setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 + stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME + if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since it'll + // be overwritten by revalidation before the next time it's used + // anyway. (InteropModeShared inhibits client caching of regular file + // data, so there's no cache to truncate either.) + return nil + } + now, haveNow := nowFromContext(ctx) + if !haveNow { + ctx.Warningf("gofer.dentry.setStat: current time not available") + } + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, stat.GID) + } + if setLocalAtime { + if stat.Atime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.atime, now) + } + } else { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + } + } + if setLocalMtime { + if stat.Mtime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.mtime, now) + } + } else { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + } + } + if haveNow { + atomic.StoreInt64(&d.ctime, now) + } + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Lock() + oldSize := d.size + d.size = stat.Size + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend := pageRoundUp(oldSize) + newpgend := pageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. + d.dataMu.Lock() + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.dataMu.Unlock() + } + } + return nil +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkCachingLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } +} + +// checkCachingLocked should be called after d's reference count becomes 0 or it +// becomes disowned. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkCachingLocked() { + // Dentries with a non-zero reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) + if atomic.LoadInt64(&d.refs) != 0 { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + return + } + // Non-child dentries with zero references are no longer reachable by path + // resolution and should be dropped immediately. + if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + d.destroyLocked() + return + } + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { + victim := d.fs.cachedDentries.Back() + d.fs.cachedDentries.Remove(victim) + d.fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path + // resolution since it was inserted into fs.cachedDentries; see + // dentry.incRefLocked(). Either way, we brought + // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so + // we don't loop. + if atomic.LoadInt64(&victim.refs) == 0 { + if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { + victimParent := victimParentVFSD.Impl().(*dentry) + victimParent.dirMu.Lock() + if !victim.vfsd.IsDisowned() { + // victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount + // points. + d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + // We're only deleting the dentry, not the file it + // represents, so we don't need to update + // victimParent.dirents etc. + } + victimParent.dirMu.Unlock() + } + victim.destroyLocked() + } + } +} + +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is +// not a child dentry. +func (d *dentry) destroyLocked() { + ctx := context.Background() + d.handleMu.Lock() + if !d.handle.file.isNil() { + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + // Write dirty pages back to the remote filesystem. + if d.handleWritable { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) + } + } + // Discard cached data. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + d.handle.close(ctx) + } + d.handleMu.Unlock() + d.file.close(ctx) + // Remove d from the set of all dentries. + d.fs.syncMu.Lock() + delete(d.fs.dentries, d) + d.fs.syncMu.Unlock() + // Drop the reference held by d on its parent. + if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { + parent := parentVFSD.Impl().(*dentry) + // This is parent.DecRef() without recursive locking of d.fs.renameMu. + if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { + parent.checkCachingLocked() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } + } +} + +func (d *dentry) isDeleted() bool { + return atomic.LoadUint32(&d.deleted) != 0 +} + +func (d *dentry) setDeleted() { + atomic.StoreUint32(&d.deleted, 1) +} + +func (d *dentry) listxattr(ctx context.Context) ([]string, error) { + return nil, syserror.ENOTSUP +} + +func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { + // TODO(jamieliu): add vfs.GetxattrOptions.Size + return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +} + +func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { + return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) +} + +func (d *dentry) removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + +// Preconditions: d.isRegularFile() || d.isDirectory(). +func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { + // O_TRUNC unconditionally requires us to obtain a new handle (opened with + // O_TRUNC). + if !trunc { + d.handleMu.RLock() + if (!read || d.handleReadable) && (!write || d.handleWritable) { + // The current handle is sufficient. + d.handleMu.RUnlock() + return nil + } + d.handleMu.RUnlock() + } + + haveOldFD := false + d.handleMu.Lock() + if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { + // Get a new handle. + wantReadable := d.handleReadable || read + wantWritable := d.handleWritable || write + h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if err != nil { + d.handleMu.Unlock() + return err + } + if !d.handle.file.isNil() { + // Check that old and new handles are compatible: If the old handle + // includes a host file descriptor but the new one does not, or + // vice versa, old and new memory mappings may be incoherent. + haveOldFD = d.handle.fd >= 0 + haveNewFD := h.fd >= 0 + if haveOldFD != haveNewFD { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) + h.close(ctx) + return syserror.EIO + } + if haveOldFD { + // We may have raced with callers of d.pf.FD() that are now + // using the old file descriptor, preventing us from safely + // closing it. We could handle this by invalidating existing + // memmap.Translations, but this is expensive. Instead, use + // dup3 to make the old file descriptor refer to the new file + // description, then close the new file descriptor (which is no + // longer needed). Racing callers may use the old or new file + // description, but this doesn't matter since they refer to the + // same file (unless d.fs.opts.overlayfsStaleRead is true, + // which we handle separately). + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + h.close(ctx) + return err + } + syscall.Close(int(h.fd)) + h.fd = d.handle.fd + if d.fs.opts.overlayfsStaleRead { + // Replace sentry mappings of the old FD with mappings of + // the new FD, since the two are not necessarily coherent. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) + h.close(ctx) + return err + } + } + // Clunk the old fid before making the new handle visible (by + // unlocking d.handleMu). + d.handle.file.close(ctx) + } + } + // Switch to the new handle. + d.handle = h + d.handleReadable = wantReadable + d.handleWritable = wantWritable + } + d.handleMu.Unlock() + + if d.fs.opts.overlayfsStaleRead && haveOldFD { + // Invalidate application mappings that may be using the old FD; they + // will be replaced with mappings using the new FD after future calls + // to d.Translate(). This requires holding d.mapsMu, which precedes + // d.handleMu in the lock order. + d.mapsMu.Lock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + } + + return nil +} + +// fileDescription is embedded by gofer implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + d := fd.dentry() + if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if + // available? + if err := d.updateFromGetattr(ctx); err != nil { + return linux.Statx{}, err + } + } + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { + return fd.dentry().listxattr(ctx) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { + return fd.dentry().getxattr(ctx, name) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.dentry().setxattr(ctx, &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.dentry().removexattr(ctx, name) +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go new file mode 100644 index 000000000..cfe66f797 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/safemem" +) + +// handle represents a remote "open file descriptor", consisting of an opened +// fid (p9.File) and optionally a host file descriptor. +type handle struct { + file p9file + fd int32 // -1 if unavailable +} + +// Preconditions: read || write. +func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { + _, newfile, err := file.walk(ctx, nil) + if err != nil { + return handle{fd: -1}, err + } + var flags p9.OpenFlags + switch { + case read && !write: + flags = p9.ReadOnly + case !read && write: + flags = p9.WriteOnly + case read && write: + flags = p9.ReadWrite + } + if trunc { + flags |= p9.OpenTruncate + } + fdobj, _, _, err := newfile.open(ctx, flags) + if err != nil { + newfile.close(ctx) + return handle{fd: -1}, err + } + fd := int32(-1) + if fdobj != nil { + fd = int32(fdobj.Release()) + } + return handle{ + file: newfile, + fd: fd, + }, nil +} + +func (h *handle) close(ctx context.Context) { + h.file.close(ctx) + h.file = p9file{} + if h.fd >= 0 { + syscall.Close(int(h.fd)) + h.fd = -1 + } +} + +func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPreadv(h.fd, dsts, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { + n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the read since p9.File.ReadAt() takes []byte. + buf := make([]byte, dsts.NumBytes()) + n, err := h.file.readAt(ctx, buf, offset) + if n == 0 { + return 0, err + } + if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { + return cp, cperr + } + return uint64(n), err +} + +func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPwritev(h.fd, srcs, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { + n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the write since p9.File.WriteAt() takes []byte. + buf := make([]byte, srcs.NumBytes()) + cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) + if cp == 0 { + return 0, cperr + } + n, err := h.file.writeAt(ctx, buf[:cp], offset) + if err != nil { + return uint64(n), err + } + return cp, cperr +} + +func (h *handle) sync(ctx context.Context) error { + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(h.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return h.file.fsync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go new file mode 100644 index 000000000..19560ab26 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/safemem" +) + +// Preconditions: !dsts.IsEmpty(). +func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if dsts.NumBlocks() == 1 { + // Use pread() instead of preadv() to avoid iovec allocation and + // copying. + dst := dsts.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(dsts) + n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} + +// Preconditions: !srcs.IsEmpty(). +func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if srcs.NumBlocks() == 1 { + // Use pwrite() instead of pwritev() to avoid iovec allocation and + // copying. + src := srcs.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(srcs) + n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go new file mode 100644 index 000000000..755ac2985 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -0,0 +1,219 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/syserror" +) + +// p9file is a wrapper around p9.File that provides methods that are +// Context-aware. +type p9file struct { + file p9.File +} + +func (f p9file) isNil() bool { + return f.file == nil +} + +func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, err := f.file.Walk(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, err +} + +func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, attrMask, attr, err +} + +// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single +// path component and returns a single qid. +func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err + } + if len(qids) != 1 { + ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) + if newfile != nil { + p9file{newfile}.close(ctx) + } + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + } + return qids[0], p9file{newfile}, attrMask, attr, nil +} + +func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + fsstat, err := f.file.StatFS() + ctx.UninterruptibleSleepFinish(false) + return fsstat, err +} + +func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qid, attrMask, attr, err := f.file.GetAttr(req) + ctx.UninterruptibleSleepFinish(false) + return qid, attrMask, attr, err +} + +func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttr(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { + ctx.UninterruptibleSleepStart(false) + val, err := f.file.GetXattr(name, size) + ctx.UninterruptibleSleepFinish(false) + return val, err +} + +func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetXattr(name, value, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Close() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, qid, iounit, err := f.file.Open(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, qid, iounit, err +} + +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.ReadAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.WriteAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.FSync() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return fdobj, p9file{newfile}, qid, iounit, err +} + +func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mkdir(name, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Symlink(oldName, newName, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) link(ctx context.Context, target p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Link(target.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Rename(newDir.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.UnlinkAt(name, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + dirents, err := f.file.Readdir(offset, count) + ctx.UninterruptibleSleepFinish(false) + return dirents, err +} + +func (f p9file) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + target, err := f.file.Readlink() + ctx.UninterruptibleSleepFinish(false) + return target, err +} + +func (f p9file) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Flush() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, err := f.file.Connect(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, err +} diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go new file mode 100644 index 000000000..847cb0784 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/pagemath.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/usermem" +) + +// This are equivalent to usermem.Addr.RoundDown/Up, but without the +// potentially truncating conversion to usermem.Addr. This is necessary because +// there is no way to define generic "PageRoundDown/Up" functions in Go. + +func pageRoundDown(x uint64) uint64 { + return x &^ (usermem.PageSize - 1) +} + +func pageRoundUp(x uint64) uint64 { + return pageRoundDown(x + usermem.PageSize - 1) +} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go new file mode 100644 index 000000000..e95209661 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -0,0 +1,872 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isRegularFile() bool { + return d.fileType() == linux.S_IFREG +} + +type regularFileFD struct { + fileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + // Skip flushing if writes may be buffered by the client, since (as with + // the VFS1 client) we don't flush buffered writes on close anyway. + d := fd.dentry() + if d.fs.opts.interop == InteropModeExclusive { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Check for reading at EOF before calling into MM (but not under + // InteropModeShared, which makes d.size unreliable). + d := fd.dentry() + if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + return 0, io.EOF + } + + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Lock d.metadataMu for the rest of the read to prevent d.size from + // changing. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Write dirty cached pages that will be touched by the read back to + // the remote file. + if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { + return 0, err + } + } + + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the read to go to the remote file. + rw.direct = true + } + n, err := dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + return n, err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => + // file_update_time(). This is d.touchCMtime(), but without locking + // d.metadataMu (recursively). + if now, ok := nowFromContext(ctx); ok { + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + } + } + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Remove touched pages from the cache. + pgstart := pageRoundDown(uint64(offset)) + pgend := pageRoundUp(uint64(offset + src.NumBytes())) + if pgend < pgstart { + return 0, syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []platform.FileRange + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + } + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the write to go to the remote file. + rw.direct = true + } + n, err := src.CopyInTo(ctx, rw) + putDentryReadWriter(rw) + if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Write dirty cached pages touched by the write back to the remote + // file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Request the remote filesystem to sync the remote file. + if err := d.handle.file.fsync(ctx); err != nil { + return 0, err + } + } + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +type dentryReadWriter struct { + ctx context.Context + d *dentry + off uint64 + direct bool +} + +var dentryReadWriterPool = sync.Pool{ + New: func() interface{} { + return &dentryReadWriter{} + }, +} + +func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { + rw := dentryReadWriterPool.Get().(*dentryReadWriter) + rw.ctx = ctx + rw.d = d + rw.off = uint64(offset) + rw.direct = false + return rw +} + +func putDentryReadWriter(rw *dentryReadWriter) { + rw.ctx = nil + rw.d = nil + dentryReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents and makes dentry.size + // unreliable), or if the file was opened O_DIRECT, read directly from + // dentry.handle without locking dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.d.handleMu.RUnlock() + rw.off += n + return n, err + } + + // Otherwise read from/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + fillCache := mf.ShouldCacheEvictable() + var dataMuUnlock func() + if fillCache { + rw.d.dataMu.Lock() + dataMuUnlock = rw.d.dataMu.Unlock + } else { + rw.d.dataMu.RLock() + dataMuUnlock = rw.d.dataMu.RUnlock + } + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= rw.d.size { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return 0, io.EOF + } + end := rw.d.size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + gapMR := gap.Range().Intersect(mr) + if fillCache { + // Read into the cache, then re-enter the loop to read from the + // cache. + reqMR := memmap.MappableRange{ + Start: pageRoundDown(gapMR.Start), + End: pageRoundUp(gapMR.End), + } + optMR := gap.Range() + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) + seg, gap = rw.d.cache.Find(rw.off) + if !seg.Ok() { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + // err might have occurred in part of gap.Range() outside + // gapMR. Forget about it for now; if the error matters and + // persists, we'll run into it again in a later iteration of + // this loop. + } else { + // Read directly from the file. + gapDsts := dsts.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != gapDsts.NumBytes() || err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + } + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.d.metadataMu must be locked. +func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents), or if the file was + // opened with O_DIRECT, write directly to dentry.handle without locking + // dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.off += n + rw.d.dataMu.Lock() + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() + return n, err + } + + // Otherwise write to/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + rw.d.dataMu.Lock() + + // Compute the range to write (overflow-checked). + start := rw.off + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + var ( + done uint64 + retErr error + ) + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + segMR := seg.Range().Intersect(mr) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + rw.d.dirty.MarkDirty(segMR) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Write directly to the file. At present, we never fill the cache + // when writing, since doing so can convert small writes into + // inefficient read-modify-write cycles, and we have no mechanism + // for detecting or avoiding this. + gapMR := gap.Range().Intersect(mr) + gapSrcs := srcs.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != gapSrcs.NumBytes() || err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + // If InteropModeWritethrough is in effect, flush written data back to the + // remote filesystem. + if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { + if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ + Start: start, + End: rw.off, + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + // We have no idea how many bytes were actually flushed. + rw.off = start + done = 0 + retErr = err + } + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() + return done, retErr +} + +func (d *dentry) writeback(ctx context.Context, offset, size int64) error { + if size == 0 { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + // Compute the range of valid bytes (overflow-checked). + if uint64(offset) >= d.size { + return nil + } + end := int64(d.size) + if rend := offset + size; rend > offset && rend < end { + end = rend + } + return fsutil.SyncDirty(ctx, memmap.MappableRange{ + Start: uint64(offset), + End: uint64(end), + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: + // Ensure file size is up to date. + d := fd.dentry() + if fd.filesystem().opts.interop == InteropModeShared { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, err + } + } + size := int64(atomic.LoadUint64(&d.size)) + // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous + // block of data. + switch whence { + case linux.SEEK_END: + offset += size + case linux.SEEK_DATA: + if offset > size { + return 0, syserror.ENXIO + } + // Use offset as specified. + case linux.SEEK_HOLE: + if offset > size { + return 0, syserror.ENXIO + } + offset = size + } + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return fd.dentry().syncSharedHandle(ctx) +} + +func (d *dentry) syncSharedHandle(ctx context.Context) error { + d.handleMu.RLock() + if !d.handleWritable { + d.handleMu.RUnlock() + return nil + } + d.dataMu.Lock() + // Write dirty cached data to the remote file. + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + d.dataMu.Unlock() + if err == nil { + // Sync the remote file. + err = d.handle.sync(ctx) + } + d.handleMu.RUnlock() + return err +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + switch d.fs.opts.interop { + case InteropModeExclusive: + // Any mapping is fine. + case InteropModeWritethrough: + // Shared writable mappings require a host FD, since otherwise we can't + // synchronously flush memory-mapped writes to the remote file. + if opts.Private || !opts.MaxPerms.Write { + break + } + fallthrough + case InteropModeShared: + // All mappings require a host FD to be coherent with other filesystem + // users. + if d.fs.opts.forcePageCache { + // Whether or not we have a host FD, we're not allowed to use it. + return syserror.ENODEV + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + if !haveFD { + return syserror.ENODEV + } + default: + panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + } + // After this point, d may be used as a memmap.Mappable. + d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) + return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) +} + +func (d *dentry) mayCachePages() bool { + if d.fs.opts.interop == InteropModeShared { + return false + } + if d.fs.opts.forcePageCache { + return true + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + return haveFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + mapped := d.mappings.AddMapping(ms, ar, offset, writable) + // Do this unconditionally since whether we have a host FD can change + // across save/restore. + for _, r := range mapped { + d.pf.hostFileMapper.IncRefOn(r) + } + if d.mayCachePages() { + // d.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := d.fs.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + } + d.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + d.pf.hostFileMapper.DecRefOn(r) + } + if d.mayCachePages() { + // Pages that are no longer referenced by any application memory + // mappings are now considered unused; allow MemoryFile to evict them + // when necessary. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + for _, r := range unmapped { + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + d.dirty.AllowClean(r) + mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + d.dataMu.Unlock() + } + d.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return d.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.handleMu.RLock() + if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + d.handleMu.RUnlock() + mr := optional + if d.fs.opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: &d.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil + } + + d.dataMu.Lock() + + // Constrain translations to d.size (rounded up) to prevent translation to + // pages that may be concurrently truncated. + pgend := pageRoundUp(d.size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + d.dataMu.Unlock() + d.handleMu.RUnlock() + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mf := d.fs.mfp.MemoryFile() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + // TODO(jamieliu): Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + d.dirty.KeepDirty(segMR) + perms.Write = true + } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) + translatedEnd = segMR.End + } + + d.dataMu.Unlock() + d.handleMu.RUnlock() + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Write the cache's contents back to the remote file so that if we have a + // host fd after restore, the remote file's contents are coherent. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + + return nil +} + +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + d.cache.Drop(mgapMR, mf) + d.dirty.KeepClean(mgapMR) + } +} + +// dentryPlatformFile implements platform.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// +// dentryPlatformFile is only used when a host FD representing the remote file +// is available (i.e. dentry.handle.fd >= 0), and that FD is used for +// application memory mappings (i.e. !filesystem.opts.forcePageCache). +type dentryPlatformFile struct { + *dentry + + // fdRefs counts references on platform.File offsets. fdRefs is protected + // by dentry.dataMu. + fdRefs fsutil.FrameRefSet + + // If this dentry represents a regular file, and handle.fd >= 0, + // hostFileMapper caches mappings of handle.fd. + hostFileMapper fsutil.HostFileMapper + + // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. + hostFileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { + d.dataMu.Lock() + seg, gap := d.fdRefs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = d.fdRefs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + return + } + } +} + +// DecRef implements platform.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { + d.dataMu.Lock() + seg := d.fdRefs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = d.fdRefs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = d.fdRefs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + +} + +// MapInternal implements platform.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + d.handleMu.RLock() + bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) + d.handleMu.RUnlock() + return bs, err +} + +// FD implements platform.File.FD. +func (d *dentryPlatformFile) FD() int { + d.handleMu.RLock() + fd := d.handle.fd + d.handleMu.RUnlock() + return int(fd) +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go new file mode 100644 index 000000000..08c691c47 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -0,0 +1,159 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// specialFileFD implements vfs.FileDescriptionImpl for files other than +// regular files, directories, and symlinks: pipes, sockets, etc. It is also +// used for regular files when filesystemOptions.specialRegularFiles is in +// effect. specialFileFD differs from regularFileFD by using per-FD handles +// instead of shared per-dentry handles, and never buffering I/O. +type specialFileFD struct { + fileDescription + + // handle is immutable. + handle handle + + // off is the file offset. off is protected by mu. (POSIX 2.9.7 only + // requires operations using the file offset to be atomic for regular files + // and symlinks; however, since specialFileFD may be used for regular + // files, we apply this atomicity unconditionally.) + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *specialFileFD) Release() { + fd.handle.close(context.Background()) + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) + fs.syncMu.Lock() + delete(fs.specialFileFDs, fd) + fs.syncMu.Unlock() +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *specialFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Going through dst.CopyOutFrom() holds MM locks around file operations of + // unknown duration. For regularFileFD, doing so is necessary to support + // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't + // hold here since specialFileFD doesn't client-cache data. Just buffer the + // read instead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + buf := make([]byte, dst.NumBytes()) + n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if n == 0 { + return 0, err + } + if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { + return int64(cp), cperr + } + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Do a buffered write. See rationale in PRead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + } + buf := make([]byte, src.NumBytes()) + // Don't do partial writes if we get a partial read from src. + if _, err := src.CopyIn(ctx, buf); err != nil { + return 0, err + } + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not + // clear that file size is even meaningful for these files. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *specialFileFD) Sync(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.sync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go new file mode 100644 index 000000000..adf43be60 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func (d *dentry) isSymlink() bool { + return d.fileType() == linux.S_IFLNK +} + +// Precondition: d.isSymlink(). +func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, mnt) + d.dataMu.Lock() + if d.haveTarget { + target := d.target + d.dataMu.Unlock() + return target, nil + } + } + target, err := d.file.readlink(ctx) + if d.fs.opts.interop != InteropModeShared { + if err == nil { + d.haveTarget = true + d.target = target + } + d.dataMu.Unlock() + } + return target, err +} diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go new file mode 100644 index 000000000..7598ec6a8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func dentryTimestampFromP9(s, ns uint64) int64 { + return int64(s*1e9 + ns) +} + +func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { + return ts.Sec*1e9 + int64(ts.Nsec) +} + +func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { + return linux.StatxTimestamp{ + Sec: ns / 1e9, + Nsec: uint32(ns % 1e9), + } +} + +func nowFromContext(ctx context.Context) (int64, bool) { + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + return clock.Now().Nanoseconds(), true + } + return 0, false +} + +// Preconditions: fs.interop != InteropModeShared. +func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now, ok := nowFromContext(ctx) + if !ok { + mnt.EndWrite() + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.atime, now) + d.metadataMu.Unlock() + mnt.EndWrite() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime(ctx context.Context) { + now, ok := nowFromContext(ctx) + if !ok { + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD new file mode 100644 index 000000000..731f192b3 --- /dev/null +++ b/pkg/sentry/fsimpl/host/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "host", + srcs = [ + "default_file.go", + "host.go", + "util.go", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go new file mode 100644 index 000000000..172cdb161 --- /dev/null +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -0,0 +1,233 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "math" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files. +type defaultFileFD struct { + fileDescription + + // canMap specifies whether we allow the file to be memory mapped. + canMap bool + + // mu protects the fields below. + mu sync.Mutex + + // offset specifies the current file offset. + offset int64 +} + +// TODO(gvisor.dev/issue/1672): Implement Waitable interface. + +// PRead implements FileDescriptionImpl. +func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags)) +} + +// Read implements FileDescriptionImpl. +func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + // These files can't be memory mapped, assert this. + if f.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + f.mu.Lock() + n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) + f.mu.Unlock() + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + f.mu.Lock() + n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags)) + f.offset += n + f.mu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { + if flags&^(linux.RWF_VALID) != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv2(fd, srcs, offset, flags) + return int64(n), err + }, + } + n, err := dst.CopyOutFrom(ctx, reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) +} + +// Write implements FileDescriptionImpl. +func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + // These files can't be memory mapped, assert this. + if f.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + f.mu.Lock() + n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) + f.mu.Unlock() + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + f.mu.Lock() + n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags)) + f.offset += n + f.mu.Unlock() + return n, err +} + +func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { + if flags&^(linux.RWF_VALID) != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev2(fd, srcs, offset, flags) + return int64(n), err + }, + } + n, err := src.CopyInTo(ctx, writer) + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + f.mu.Lock() + defer f.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW + } + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(f.inode.hostFD, &s); err != nil { + return f.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return f.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(f.inode.hostFD, offset, int(whence)) + if err != nil { + return f.offset, err + } + f.offset = n + + default: + // Invalid whence. + return f.offset, syserror.EINVAL + } + + return f.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *defaultFileFD) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.canMap { + return syserror.ENODEV + } + // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. + return syserror.ENODEV +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go new file mode 100644 index 000000000..c205e6a0b --- /dev/null +++ b/pkg/sentry/fsimpl/host/host.go @@ -0,0 +1,286 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host provides a filesystem implementation for host files imported as +// file descriptors. +package host + +import ( + "errors" + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { + // Must be importing to a mount of host.filesystem. + fs, ok := mnt.Filesystem().Impl().(*filesystem) + if !ok { + return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) + } + + // Retrieve metadata. + var s syscall.Stat_t + if err := syscall.Fstat(hostFD, &s); err != nil { + return nil, err + } + + fileMode := linux.FileMode(s.Mode) + fileType := fileMode.FileType() + // Pipes, character devices, and sockets can return EWOULDBLOCK for + // operations that would block. + isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK + + i := &inode{ + hostFD: hostFD, + isStream: isStream, + isTTY: isTTY, + ino: fs.NextIno(), + mode: fileMode, + uid: ownerUID, + gid: ownerGID, + } + + d := &kernfs.Dentry{} + d.Init(i) + // i.open will take a reference on d. + defer d.DecRef() + + return i.open(d.VFSDentry(), mnt) +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // When the reference count reaches zero, the host fd is closed. + refs.AtomicRefCount + + // hostFD contains the host fd that this file was originally created from, + // which must be available at time of restore. + // + // This field is initialized at creation time and is immutable. + hostFD int + + // isStream is true if the host fd points to a file representing a stream, + // e.g. a socket or a pipe. Such files are not seekable and can return + // EWOULDBLOCK for I/O operations. + // + // This field is initialized at creation time and is immutable. + isStream bool + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool + + // ino is an inode number unique within this filesystem. + ino uint64 + + // mu protects the inode metadata below. + mu sync.Mutex + + // mode is the file mode of this inode. Note that this value may become out + // of date if the mode is changed on the host, e.g. with chmod. + mode linux.FileMode + + // uid and gid of the file owner. Note that these refer to the owner of the + // file created on import, not the fd on the host. + uid auth.KUID + gid auth.KGID +} + +// Note that these flags may become out of date, since they can be modified +// on the host, e.g. with fcntl. +func fileFlagsFromHostFD(fd int) (int, error) { + flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0) + if err != nil { + log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err) + return 0, err + } + // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. + flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND + return flags, nil +} + +// CheckPermissions implements kernfs.Inode. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid) +} + +// Mode implements kernfs.Inode. +func (i *inode) Mode() linux.FileMode { + return i.mode +} + +// Stat implements kernfs.Inode. +func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Statx_t + if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + return linux.Statx{}, err + } + ls := unixToLinuxStatx(s) + + // Use our own internal inode number and file owner. + // + // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. + // If we use the device number from the host, it may collide with another + // sentry-internal device number. We handle device/inode numbers without + // relying on the host to prevent collisions. + ls.Ino = i.ino + ls.UID = uint32(i.uid) + ls.GID = uint32(i.gid) + + // Update file mode from the host. + i.mode = linux.FileMode(ls.Mode) + + return ls, nil +} + +// SetStat implements kernfs.Inode. +func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + s := opts.Stat + + m := s.Mask + if m == 0 { + return nil + } + if m&(linux.STATX_UID|linux.STATX_GID) != 0 { + return syserror.EPERM + } + if m&linux.STATX_MODE != 0 { + if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { + return err + } + i.mode = linux.FileMode(s.Mode) + } + if m&linux.STATX_SIZE != 0 { + if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { + return err + } + } + if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { + timestamps := []unix.Timespec{ + toTimespec(s.Atime, m&linux.STATX_ATIME == 0), + toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), + } + if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil { + return err + } + } + return nil +} + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef() { + i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +} + +// Destroy implements kernfs.Inode. +func (i *inode) Destroy() { + if err := unix.Close(i.hostFD); err != nil { + log.Warningf("failed to close host fd %d: %v", i.hostFD, err) + } +} + +// Open implements kernfs.Inode. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.open(vfsd, rp.Mount()) +} + +func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { + + fileType := i.mode.FileType() + if fileType == syscall.S_IFSOCK { + if i.isTTY { + return nil, errors.New("cannot use host socket as TTY") + } + // TODO(gvisor.dev/issue/1672): support importing sockets. + return nil, errors.New("importing host sockets not supported") + } + + if i.isTTY { + // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. + return nil, errors.New("importing host fd as TTY not supported") + } + + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &defaultFileFD{ + fileDescription: fileDescription{ + inode: i, + }, + canMap: canMap(uint32(fileType)), + mu: sync.Mutex{}, + offset: 0, + } + + vfsfd := &fd.vfsfd + flags, err := fileFlagsFromHostFD(i.hostFD) + if err != nil { + return nil, err + } + + if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil +} + +// fileDescription is embedded by host fd implementations of FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but + // cached to reduce indirections and casting. fileDescription does not hold + // a reference on the inode through the inode field (since one is already + // held via the Dentry). + // + // inode is immutable after fileDescription creation. + inode *inode +} + +// SetStat implements vfs.FileDescriptionImpl. +func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error { + return f.inode.SetStat(nil, opts) +} + +// Stat implements vfs.FileDescriptionImpl. +func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return f.inode.Stat(nil, opts) +} + +// Release implements vfs.FileDescriptionImpl. +func (f *fileDescription) Release() { + // noop +} diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go new file mode 100644 index 000000000..e1ccacb4d --- /dev/null +++ b/pkg/sentry/fsimpl/host/util.go @@ -0,0 +1,86 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { + if omit { + return unix.Timespec{ + Sec: 0, + Nsec: unix.UTIME_OMIT, + } + } + return unix.Timespec{ + Sec: int64(ts.Sec), + Nsec: int64(ts.Nsec), + } +} + +func unixToLinuxStatx(s unix.Statx_t) linux.Statx { + return linux.Statx{ + Mask: s.Mask, + Blksize: s.Blksize, + Attributes: s.Attributes, + Nlink: s.Nlink, + UID: s.Uid, + GID: s.Gid, + Mode: s.Mode, + Ino: s.Ino, + Size: s.Size, + Blocks: s.Blocks, + AttributesMask: s.Attributes_mask, + Atime: unixToLinuxStatxTimestamp(s.Atime), + Btime: unixToLinuxStatxTimestamp(s.Btime), + Ctime: unixToLinuxStatxTimestamp(s.Ctime), + Mtime: unixToLinuxStatxTimestamp(s.Mtime), + RdevMajor: s.Rdev_major, + RdevMinor: s.Rdev_minor, + DevMajor: s.Dev_major, + DevMinor: s.Dev_minor, + } +} + +func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} +} + +// wouldBlock returns true for file types that can return EWOULDBLOCK +// for blocking operations, e.g. pipes, character devices, and sockets. +func wouldBlock(fileType uint32) bool { + return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK +} + +// canMap returns true if a file with fileType is allowed to be memory mapped. +// This is ported over from VFS1, but it's probably not the best way for us +// to check if a file can be memory mapped. +func canMap(fileType uint32) bool { + // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()). + // + // TODO(b/38213152): Some obscure character devices can be mapped. + return fileType == syscall.S_IFREG +} + +// isBlockError checks if an error is EAGAIN or EWOULDBLOCK. +// If so, they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK +} diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 733792c78..1c026f4d8 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) Release() {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(fs), nil + return fd.inode.Stat(fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 6104751c8..da821d524 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,12 +43,12 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { - if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error { + if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err } fd.children = children @@ -107,17 +107,21 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent fs.mu.Lock() defer fs.mu.Unlock() + opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat := fd.inode().Stat(vfsFS) + stat, err := fd.inode().Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: stat.Ino, NextOff: 1, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } @@ -125,15 +129,18 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode - stat := parentInode.Stat(vfsFS) + stat, err := parentInode.Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: "..", Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: 2, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } @@ -146,15 +153,18 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { inode := it.Dentry.Impl().(*Dentry).inode - stat := inode.Stat(vfsFS) + stat, err := inode.Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: it.Name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } @@ -190,7 +200,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() - return inode.Stat(fs), nil + return inode.Stat(fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 9d65d0179..1d7e04ad4 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// This file implements vfs.FilesystemImpl for kernfs. - package kernfs +// This file implements vfs.FilesystemImpl for kernfs. + import ( "fmt" @@ -111,10 +111,10 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Dentry isn't cached; it either doesn't exist or failed // revalidation. Attempt to resolve it via Lookup. // - // FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry, - // not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries - // in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl - // casts accordingly. + // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return + // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes + // that all dentries in the filesystem are (kernfs.)Dentry and performs + // vfs.DentryImpl casts accordingly. var err error childVFSD, err = parent.inode.Lookup(ctx, name) if err != nil { @@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { @@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } // May create new file. @@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) @@ -438,7 +438,7 @@ afterTrailingSymlink: return nil, err } parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + return child.Impl().(*Dentry).inode.Open(rp, child, opts) } // Open existing file or follow symlink. if mustCreate { @@ -463,7 +463,7 @@ afterTrailingSymlink: if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts.Flags) + return childInode.Open(rp, childVFSD, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() virtfs := rp.VirtualFilesystem() srcDirDentry := srcDirVFSD.Impl().(*Dentry) @@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { @@ -630,7 +634,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } - return inode.Stat(fs.VFSFilesystem()), nil + return inode.Stat(fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. @@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index adca2313f..d50018b18 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -36,20 +36,20 @@ type InodeNoopRefCount struct { } // IncRef implements Inode.IncRef. -func (n *InodeNoopRefCount) IncRef() { +func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (n *InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef() { } // TryIncRef implements Inode.TryIncRef. -func (n *InodeNoopRefCount) TryIncRef() bool { +func (InodeNoopRefCount) TryIncRef() bool { return true } // Destroy implements Inode.Destroy. -func (n *InodeNoopRefCount) Destroy() { +func (InodeNoopRefCount) Destroy() { } // InodeDirectoryNoNewChildren partially implements the Inode interface. @@ -58,27 +58,27 @@ func (n *InodeNoopRefCount) Destroy() { type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } @@ -90,62 +90,62 @@ type InodeNotDirectory struct { } // HasChildren implements Inode.HasChildren. -func (*InodeNotDirectory) HasChildren() bool { +func (InodeNotDirectory) HasChildren() bool { return false } // NewFile implements Inode.NewFile. -func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { panic("Lookup called on non-directory inode") } // IterDirents implements Inode.IterDirents. -func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { +func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { panic("IterDirents called on non-directory inode") } // Valid implements Inode.Valid. -func (*InodeNotDirectory) Valid(context.Context) bool { +func (InodeNotDirectory) Valid(context.Context) bool { return true } @@ -157,17 +157,17 @@ func (*InodeNotDirectory) Valid(context.Context) bool { type InodeNoDynamicLookup struct{} // Lookup implements Inode.Lookup. -func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { return nil, syserror.ENOENT } // IterDirents implements Inode.IterDirents. -func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { +func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return offset, nil } // Valid implements Inode.Valid. -func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { +func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { return true } @@ -177,7 +177,7 @@ func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (*InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } @@ -219,7 +219,7 @@ func (a *InodeAttrs) Mode() linux.FileMode { // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. -func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { +func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK stat.Ino = atomic.LoadUint64(&a.ino) @@ -230,7 +230,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { // TODO: Implement other stat fields like timestamps. - return stat + return stat, nil } // SetStat implements Inode.SetStat. @@ -507,7 +507,7 @@ type InodeSymlink struct { } // Open implements Inode.Open. -func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F } // Open implements kernfs.Inode. -func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 79ebea8a5..a8ab2a2ba 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -176,8 +176,6 @@ type Dentry struct { vfsd vfs.Dentry inode Inode - refs uint64 - // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 @@ -302,8 +300,9 @@ type Inode interface { // this inode. The returned file description should hold a reference on the // inode for its lifetime. // - // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. - Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) + // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing + // the inode on which Open() is being called. + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { @@ -328,7 +327,7 @@ type inodeMetadata interface { // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. - Stat(fs *vfs.Filesystem) linux.Statx + Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index ee65cf491..0459fb305 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - v := vfs.New() + v := &vfs.VirtualFilesystem{} + if err := v.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -113,9 +116,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { return nil, err } return fd.VFSFileDescription(), nil @@ -143,9 +146,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 12aac2e6a..a83245866 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -14,6 +14,7 @@ go_library( "tasks_net.go", "tasks_sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 11477b6a9..5c19d5522 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -26,15 +26,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -// procFSType is the factory class for procfs. +// Name is the default filesystem name. +const Name = "proc" + +// FilesystemType is the factory class for procfs. // // +stateify savable -type procFSType struct{} +type FilesystemType struct{} -var _ vfs.FilesystemType = (*procFSType)(nil) +var _ vfs.FilesystemType = (*FilesystemType)(nil) // GetFilesystem implements vfs.FilesystemType. -func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") @@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile procfs := &kernfs.Filesystem{} procfs.VFSFilesystem().Init(vfsObj, procfs) - var data *InternalData + var cgroups map[string]string if opts.InternalData != nil { - data = opts.InternalData.(*InternalData) + data := opts.InternalData.(*InternalData) + cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups) + _, dentry := newTasksInode(procfs, k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 353e37195..611645f3f 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -105,8 +105,8 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -114,15 +114,20 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb } // Open implements kernfs.Inode. -func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } // Stat implements kernfs.Inode. -func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { - stat := i.InodeAttrs.Stat(vsfs) - stat.Nlink += uint32(i.task.ThreadGroup().Count()) - return stat +func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&linux.STATX_NLINK != 0 { + stat.Nlink += uint32(i.task.ThreadGroup().Count()) + } + return stat, nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index eb5bc62c0..c0d643f51 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), }), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) @@ -98,9 +100,9 @@ func (i *taskInode) Valid(ctx context.Context) bool { } // Open implements kernfs.Inode. -func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } @@ -152,12 +154,21 @@ func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, childre } // Stat implements kernfs.Inode. -func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { - stat := i.Inode.Stat(fs) - uid, gid := i.getOwner(linux.FileMode(stat.Mode)) - stat.UID = uint32(uid) - stat.GID = uint32(gid) - return stat +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.Inode.Stat(fs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + if opts.Mask&linux.STATX_UID != 0 { + stat.UID = uint32(uid) + } + if opts.Mask&linux.STATX_GID != 0 { + stat.GID = uint32(gid) + } + } + return stat, nil } // CheckPermissions implements kernfs.Inode. @@ -234,7 +245,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr // member, there is one entry containing three colon-separated fields: // hierarchy-ID:controller-list:cgroup-path" func newCgroupData(controllers map[string]string) dynamicInode { - buf := bytes.Buffer{} + var buf bytes.Buffer // The hierarchy ids must be positive integers (for cgroup v1), but the // exact number does not matter, so long as they are unique. We can diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index efd3b3453..5a231ac86 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) return nil } + +// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. +// +// +stateify savable +type oomScoreAdj struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { + adj, err := o.task.OOMScoreAdj() + if err != nil { + return err + } + fmt.Fprintf(buf, "%d\n", adj) + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := o.task.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 14bd334e8..b1e39c82f 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "net": newNetDir(root, inoGen, k), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), } inode := &tasksInode{ @@ -151,8 +151,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -163,8 +163,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -196,8 +196,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -205,23 +205,28 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback } // Open implements kernfs.Inode. -func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } -func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { - stat := i.InodeAttrs.Stat(vsfs) +func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } - // Add dynamic children to link count. - for _, tg := range i.pidns.ThreadGroups() { - if leader := tg.Leader(); leader != nil { - stat.Nlink++ + if opts.Mask&linux.STATX_NLINK != 0 { + // Add dynamic children to link count. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } } } - return stat + return stat, nil } func cpuInfoData(k *kernel.Kernel) string { diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 608fec017..d4e1812d8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -39,7 +39,10 @@ import ( func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. + if stack := k.RootNetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index c7ce74883..3d5dc463c 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6fc3524db..0eb401619 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -63,21 +63,23 @@ var ( "thread-self": threadSelfLink.NextOff, } taskStaticFiles = map[string]testutil.DirentType{ - "auxv": linux.DT_REG, - "cgroup": linux.DT_REG, - "cmdline": linux.DT_REG, - "comm": linux.DT_REG, - "environ": linux.DT_REG, - "gid_map": linux.DT_REG, - "io": linux.DT_REG, - "maps": linux.DT_REG, - "ns": linux.DT_DIR, - "smaps": linux.DT_REG, - "stat": linux.DT_REG, - "statm": linux.DT_REG, - "status": linux.DT_REG, - "task": linux.DT_DIR, - "uid_map": linux.DT_REG, + "auxv": linux.DT_REG, + "cgroup": linux.DT_REG, + "cmdline": linux.DT_REG, + "comm": linux.DT_REG, + "environ": linux.DT_REG, + "gid_map": linux.DT_REG, + "io": linux.DT_REG, + "maps": linux.DT_REG, + "ns": linux.DT_DIR, + "oom_score": linux.DT_REG, + "oom_score_adj": linux.DT_REG, + "smaps": linux.DT_REG, + "stat": linux.DT_REG, + "statm": linux.DT_REG, + "status": linux.DT_REG, + "task": linux.DT_DIR, + "uid_map": linux.DT_REG, } ) @@ -90,8 +92,7 @@ func setup(t *testing.T) *testutil.System { ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) fsOpts := vfs.GetFilesystemOptions{ @@ -102,11 +103,11 @@ func setup(t *testing.T) *testutil.System { }, }, } - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } - return testutil.NewSystem(ctx, t, vfsObj, mntns) + return testutil.NewSystem(ctx, t, k.VFS(), mntns) } func TestTasksEmpty(t *testing.T) { @@ -131,7 +132,7 @@ func TestTasks(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -213,7 +214,7 @@ func TestTasksOffset(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) for i := 0; i < 3; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil { t.Fatalf("CreateTask(): %v", err) } } @@ -337,7 +338,7 @@ func TestTask(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - _, err := testutil.CreateTask(s.Ctx, "name", tc) + _, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -352,7 +353,7 @@ func TestProcSelf(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, "name", tc) + task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -433,7 +434,7 @@ func TestTree(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 66c0d8bc8..a741e2bb6 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index e35d52d17..c36c4fa11 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. +const Name = "sysfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -97,9 +100,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { } // Open implements kernfs.Inode.Open. -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 8b1cf0bd0..4b3602d47 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System { } ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - v := vfs.New() - v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } - return testutil.NewSystem(ctx, t, v, mns) + return testutil.NewSystem(ctx, t, k.VFS(), mns) } func TestReadCPUFile(t *testing.T) { diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index efd5974c4..e4f36f4ae 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,7 +16,7 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", - "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 89f8c4915..488478e29 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" @@ -33,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" // Platforms are plugable. _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" @@ -99,36 +100,41 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } - ctx := k.SupervisorContext() + kernel.VFS2Enabled = true - // Create mount namespace without root as it's the minimum required to create - // the global thread group. - mntns, err := fs.NewMountNamespace(ctx, nil) - if err != nil { - return nil, err + if err := k.VFS().Init(); err != nil { + return nil, fmt.Errorf("VFS init: %v", err) } + k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + ls, err := limits.NewLinuxLimitSet() if err != nil { return nil, err } - tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) k.TestOnly_SetGlobalInit(tg) return k, nil } // CreateTask creates a new bare bones task for tests. -func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { +func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) config := &kernel.TaskConfig{ Kernel: k, ThreadGroup: tc, TaskContext: &kernel.TaskContext{Name: name}, Credentials: auth.CredentialsFromContext(ctx), + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), UTSNamespace: kernel.UTSNamespaceFromContext(ctx), IPCNamespace: kernel.IPCNamespaceFromContext(ctx), AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + MountNamespaceVFS2: mntns, + FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), } return k.TaskSet().NewTask(config) } diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 1c98335c1..e16808c63 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -41,12 +41,12 @@ type System struct { Creds *auth.Credentials VFS *vfs.VirtualFilesystem Root vfs.VirtualDentry - mns *vfs.MountNamespace + MntNs *vfs.MountNamespace } // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on mns, whose ownership +// Precondition: Caller must hold a reference on MntNs, whose ownership // is transferred to the new System. func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { s := &System{ @@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, - mns: mns, + MntNs: mns, Root: mns.Root(), } return s @@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System { Ctx: s.Ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { Ctx: ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { s.Root.DecRef() - s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem. + s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. @@ -226,7 +226,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) { } // Handle implements vfs.IterDirentsCallback.Handle. -func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { +func (d *DirentCollector) Handle(dirent vfs.Dirent) error { d.mu.Lock() if d.dirents == nil { d.dirents = make(map[string]*vfs.Dirent) @@ -234,7 +234,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { d.order = append(d.order, &dirent) d.dirents[dirent.Name] = &dirent d.mu.Unlock() - return true + return nil } // Count returns the number of dirents currently in the collector. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index c61366224..57abd5583 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", @@ -47,6 +48,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -86,6 +88,7 @@ go_test( "//pkg/context", "//pkg/fspath", "//pkg/sentry/contexttest", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 54241c8e8..383133e44 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -183,7 +186,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -374,7 +380,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index dc0d27cf9..b4380af38 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -74,25 +74,25 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba defer fs.mu.Unlock() if fd.off == 0 { - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: vfsd.Impl().(*dentry).inode.ino, NextOff: 1, - }) { - return nil + }); err != nil { + return err } fd.off++ } if fd.off == 1 { parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), Ino: parentInode.ino, NextOff: 2, - }) { - return nil + }); err != nil { + return err } fd.off++ } @@ -111,14 +111,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba for child != nil { // Skip other directoryFD iterators. if child.inode != nil { - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: child.vfsd.Name(), Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, - }) { + }); err != nil { dir.childList.InsertBefore(child, fd.iter) - return nil + return err } fd.off++ } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5ee9cf1e9..e1b551422 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -16,7 +16,6 @@ package tmpfs import ( "fmt" - "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -334,7 +333,7 @@ afterTrailingSymlink: } func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { return nil, err @@ -347,10 +346,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, err } if opts.Flags&linux.O_TRUNC != 0 { - impl.mu.Lock() - impl.data.Truncate(0, impl.memFile) - atomic.StoreUint64(&impl.size, 0) - impl.mu.Unlock() + if _, err := impl.truncate(0); err != nil { + return nil, err + } } return &fd.vfsfd, nil case *directory: @@ -486,7 +484,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa vfsObj := rp.VirtualFilesystem() oldParentDir := oldParent.inode.impl.(*directory) newParentDir := newParent.inode.impl.(*directory) - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { return err } if replaced != nil { @@ -543,7 +543,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) @@ -622,7 +624,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if child.inode.isDir() { return syserror.EISDIR } - if !rp.MustBeDir() { + if rp.MustBeDir() { return syserror.ENOTDIR } mnt := rp.Mount() @@ -631,7 +633,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 5ee7f2a72..1614f2c39 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index e9e6faf67..711442424 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -15,6 +15,7 @@ package tmpfs import ( + "fmt" "io" "math" "sync/atomic" @@ -22,7 +23,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -33,25 +36,53 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// regularFile is a regular (=S_IFREG) tmpfs file. type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile - // mu protects the fields below. - mu sync.RWMutex + // mapsMu protects mappings. + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the file into memmap.MappingSpaces. + // + // Protected by mapsMu. + mappings memmap.MappingSet + + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. + // + // Protected by mapsMu. + writableMappingPages uint64 + + // dataMu protects the fields below. + dataMu sync.RWMutex // data maps offsets into the file to offsets into memFile that store // the file's data. + // + // Protected by dataMu. data fsutil.FileRangeSet - // size is the size of data, but accessed using atomic memory - // operations to avoid locking in inode.stat(). - size uint64 - // seals represents file seals on this inode. + // + // Protected by dataMu. seals uint32 + + // size is the size of data. + // + // Protected by both dataMu and inode.mu; reading it requires holding + // either mutex, while writing requires holding both AND using atomics. + // Readers that do not require consistency (like Stat) may read the + // value atomically without holding either lock. + size uint64 } func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { @@ -65,39 +96,170 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. -func (rf *regularFile) truncate(size uint64) (bool, error) { - rf.mu.Lock() - defer rf.mu.Unlock() +func (rf *regularFile) truncate(newSize uint64) (bool, error) { + rf.inode.mu.Lock() + defer rf.inode.mu.Unlock() + return rf.truncateLocked(newSize) +} - if size == rf.size { +// Preconditions: rf.inode.mu must be held. +func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { + oldSize := rf.size + if newSize == oldSize { // Nothing to do. return false, nil } - if size > rf.size { - // Growing the file. + // Need to hold inode.mu and dataMu while modifying size. + rf.dataMu.Lock() + if newSize > oldSize { + // Can we grow the file? if rf.seals&linux.F_SEAL_GROW != 0 { - // Seal does not allow growth. + rf.dataMu.Unlock() return false, syserror.EPERM } - rf.size = size + // We only need to update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() return true, nil } - // Shrinking the file + // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { - // Seal does not allow shrink. + rf.dataMu.Unlock() return false, syserror.EPERM } - // TODO(gvisor.dev/issues/1197): Invalidate mappings once we have - // mappings. + // Update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() + + // Invalidate past translations of truncated pages. + oldpgend := fs.OffsetPageEnd(int64(oldSize)) + newpgend := fs.OffsetPageEnd(int64(newSize)) + if newpgend < oldpgend { + rf.mapsMu.Lock() + rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/shmem.c:shmem_setattr() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + rf.mapsMu.Unlock() + } - rf.data.Truncate(size, rf.memFile) - rf.size = size + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them. + rf.dataMu.Lock() + rf.data.Truncate(newSize, rf.memFile) + rf.dataMu.Unlock() return true, nil } +// AddMapping implements memmap.Mappable.AddMapping. +func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if rf.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + + rf.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } + + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + + rf.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return rf.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + rf.dataMu.Lock() + defer rf.dataMu.Unlock() + + // Constrain translations to f.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(int64(rf.size)) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + // Newly-allocated pages are zeroed, so we don't need to do anything. + return dsts.NumBytes(), nil + }) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: rf.memFile, + Offset: seg.FileRangeOf(segMR).Start, + Perms: usermem.AnyAccess, + }) + translatedEnd = segMR.End + } + + // Don't return the error returned by f.data.Fill if it occurred outside of + // required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (*regularFile) InvalidateUnsavable(context.Context) error { + return nil +} + type regularFileFD struct { fileDescription @@ -151,8 +313,10 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Overflow. return 0, syserror.EFBIG } + f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) + f.inode.mu.Unlock() putRegularFileReadWriter(rw) return n, err } @@ -192,6 +356,34 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { return nil } +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { + return fd.inode().lockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { + fd.inode().unlockBSD(uid) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { + return fd.inode().lockPOSIX(uid, t, rng, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { + fd.inode().unlockPOSIX(uid, rng) + return nil +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + file := fd.inode().impl.(*regularFile) + return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts) +} + // regularFileReadWriter implements safemem.Reader and Safemem.Writer. type regularFileReadWriter struct { file *regularFile @@ -221,14 +413,15 @@ func putRegularFileReadWriter(rw *regularFileReadWriter) { // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - rw.file.mu.RLock() + rw.file.dataMu.RLock() + defer rw.file.dataMu.RUnlock() + size := rw.file.size // Compute the range to read (limited by file size and overflow-checked). - if rw.off >= rw.file.size { - rw.file.mu.RUnlock() + if rw.off >= size { return 0, io.EOF } - end := rw.file.size + end := size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } @@ -242,7 +435,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er // Get internal mappings. ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -252,7 +444,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -268,7 +459,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -276,13 +466,16 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } - rw.file.mu.RUnlock() return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: inode.mu must be held. func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - rw.file.mu.Lock() + // Hold dataMu so we can modify size. + rw.file.dataMu.Lock() + defer rw.file.dataMu.Unlock() // Compute the range to write (overflow-checked). end := rw.off + srcs.NumBytes() @@ -293,7 +486,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, // Check if seals prevent either file growth or all writes. switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed - rw.file.mu.Unlock() return 0, syserror.EPERM case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would @@ -315,7 +507,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, } if end <= rw.off { // Truncation would result in no data being written. - rw.file.mu.Unlock() return 0, syserror.EPERM } } @@ -372,9 +563,8 @@ exitLoop: // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.off > rw.file.size { - atomic.StoreUint64(&rw.file.size, rw.off) + rw.file.size = rw.off } - rw.file.mu.Unlock() return done, retErr } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 32552e261..0399725cf 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -24,9 +24,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,7 +40,11 @@ var nextFileID int64 func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -49,7 +55,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr root := mntns.Root() return vfsObj, root, func() { root.DecRef() - mntns.DecRef(vfsObj) + mntns.DecRef() }, nil } @@ -260,6 +266,60 @@ func TestPWrite(t *testing.T) { } } +func TestLocks(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + var ( + uid1 lock.UniqueID + uid2 lock.UniqueID + // Non-blocking. + block lock.Blocker + ) + + uid1 = 123 + uid2 = 456 + + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { + t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + + rng1 := lock.LockRange{0, 1} + rng2 := lock.LockRange{1, 2} + + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) + } +} + func TestPRead(t *testing.T) { ctx := contexttest.Context(t) fd, cleanup, err := newFileFD(ctx, 0644) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 88dbd6e35..521206305 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -18,9 +18,10 @@ // Lock order: // // filesystem.mu -// regularFileFD.offMu -// regularFile.mu // inode.mu +// regularFileFD.offMu +// regularFile.mapsMu +// regularFile.dataMu package tmpfs import ( @@ -30,14 +31,19 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. +const Name = "tmpfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -153,6 +159,9 @@ type inode struct { rdevMajor uint32 rdevMinor uint32 + // Advisory file locks, which lock at the inode level. + locks lock.FileLocks + impl interface{} // immutable } @@ -218,12 +227,15 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - // This is unnecessary; it's mostly to simulate what tmpfs would do. if regFile, ok := i.impl.(*regularFile); ok { - regFile.mu.Lock() + // Hold inode.mu and regFile.dataMu while mutating + // size. + i.mu.Lock() + regFile.dataMu.Lock() regFile.data.DropAll(regFile.memFile) atomic.StoreUint64(®File.size, 0) - regFile.mu.Unlock() + regFile.dataMu.Unlock() + i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") @@ -312,7 +324,7 @@ func (i *inode) setStat(stat linux.Statx) error { if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: - updated, err := impl.truncate(stat.Size) + updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } @@ -352,6 +364,44 @@ func (i *inode) setStat(stat linux.Statx) error { return nil } +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockBSD(uid, t, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockBSD(uid fslock.UniqueID) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockBSD(uid) + return nil + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockPOSIX(uid, t, rng, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockPOSIX(uid, rng) + return nil + } + return syserror.EBADF +} + // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 334432abf..07bf39fed 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -10,6 +10,7 @@ go_library( srcs = [ "context.go", "inet.go", + "namespace.go", "test_stack.go", ], deps = [ diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index a7dfb78a7..2916a0644 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -28,6 +28,10 @@ type Stack interface { // interface indexes to a slice of associated interface address properties. InterfaceAddrs() map[int32][]InterfaceAddr + // AddInterfaceAddr adds an address to the network interface identified by + // index. + AddInterfaceAddr(idx int32, addr InterfaceAddr) error + // SupportsIPv6 returns true if the stack supports IPv6 connectivity. SupportsIPv6() bool diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go new file mode 100644 index 000000000..029af3025 --- /dev/null +++ b/pkg/sentry/inet/namespace.go @@ -0,0 +1,102 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package inet + +// Namespace represents a network namespace. See network_namespaces(7). +// +// +stateify savable +type Namespace struct { + // stack is the network stack implementation of this network namespace. + stack Stack `state:"nosave"` + + // creator allows kernel to create new network stack for network namespaces. + // If nil, no networking will function if network is namespaced. + // + // At afterLoad(), creator will be used to create network stack. Stateify + // needs to wait for this field to be loaded before calling afterLoad(). + creator NetworkStackCreator `state:"wait"` + + // isRoot indicates whether this is the root network namespace. + isRoot bool +} + +// NewRootNamespace creates the root network namespace, with creator +// allowing new network namespaces to be created. If creator is nil, no +// networking will function if the network is namespaced. +func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace { + return &Namespace{ + stack: stack, + creator: creator, + isRoot: true, + } +} + +// NewNamespace creates a new network namespace from the root. +func NewNamespace(root *Namespace) *Namespace { + n := &Namespace{ + creator: root.creator, + } + n.init() + return n +} + +// Stack returns the network stack of n. Stack may return nil if no network +// stack is configured. +func (n *Namespace) Stack() Stack { + return n.stack +} + +// IsRoot returns whether n is the root network namespace. +func (n *Namespace) IsRoot() bool { + return n.isRoot +} + +// RestoreRootStack restores the root network namespace with stack. This should +// only be called when restoring kernel. +func (n *Namespace) RestoreRootStack(stack Stack) { + if !n.isRoot { + panic("RestoreRootStack can only be called on root network namespace") + } + if n.stack != nil { + panic("RestoreRootStack called after a stack has already been set") + } + n.stack = stack +} + +func (n *Namespace) init() { + // Root network namespace will have stack assigned later. + if n.isRoot { + return + } + if n.creator != nil { + var err error + n.stack, err = n.creator.CreateStack() + if err != nil { + panic(err) + } + } +} + +// afterLoad is invoked by stateify. +func (n *Namespace) afterLoad() { + n.init() +} + +// NetworkStackCreator allows new instances of a network stack to be created. It +// is used by the kernel to create new network namespaces when requested. +type NetworkStackCreator interface { + // CreateStack creates a new network stack for a network namespace. + CreateStack() (Stack, error) +} diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index dcfcbd97e..d8961fc94 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -47,6 +47,12 @@ func (s *TestStack) InterfaceAddrs() map[int32][]InterfaceAddr { return s.InterfaceAddrsMap } +// AddInterfaceAddr implements Stack.AddInterfaceAddr. +func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error { + s.InterfaceAddrsMap[idx] = append(s.InterfaceAddrsMap[idx], addr) + return nil +} + // SupportsIPv6 implements Stack.SupportsIPv6. func (s *TestStack) SupportsIPv6() bool { return s.SupportsIPv6Flag diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a27628c0a..beba29a09 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -91,6 +91,7 @@ go_library( "fs_context.go", "ipc_namespace.go", "kernel.go", + "kernel_opts.go", "kernel_state.go", "pending_signals.go", "pending_signals_list.go", @@ -156,6 +157,7 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/metric", "//pkg/refs", @@ -166,6 +168,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fsbridge", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", @@ -198,6 +201,7 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index a0d35d350..8e9f200d0 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -38,11 +38,14 @@ func (e *EventPoll) afterLoad() { } } - for it := e.waitingList.Front(); it != nil; it = it.Next() { - if it.id.File.Readiness(it.mask) != 0 { - e.waitingList.Remove(it) - e.readyList.PushBack(it) - it.curList = &e.readyList + for it := e.waitingList.Front(); it != nil; { + entry := it + it = it.Next() + + if entry.id.File.Readiness(entry.mask) != 0 { + e.waitingList.Remove(entry) + e.readyList.PushBack(entry) + entry.curList = &e.readyList e.Notify(waiter.EventIn) } } diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 23b88f7a6..58001d56c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return fds, nil } +// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// the given file description. If it succeeds, it takes a reference on file. +func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + if minfd < 0 { + // Don't accept negative FDs. + return -1, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if minfd >= end { + return -1, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + fd := minfd + if fd < f.next { + fd = f.next + } + for fd < end { + if d, _, _ := f.get(fd); d == nil { + f.setVFS2(fd, file, flags) + if fd == f.next { + // Update next search start position. + f.next = fd + 1 + } + return fd, nil + } + fd++ + } + return -1, syscall.EMFILE +} + // NewFDAt sets the file reference for the given FD. If there is an active // reference for that FD, the ref count for that existing reference is // decremented. @@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 return syscall.EBADF } - f.mu.Lock() - defer f.mu.Unlock() - // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { @@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 } // Install the entry. + f.mu.Lock() + defer f.mu.Unlock() f.setAll(fd, file, fileVFS2, flags) return nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 2448c1d99..47f78df9a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -37,10 +38,16 @@ type FSContext struct { // destroyed. root *fs.Dirent + // rootVFS2 is the filesystem root. + rootVFS2 vfs.VirtualDentry + // cwd is the current working directory. Will be nil iff the FSContext // has been destroyed. cwd *fs.Dirent + // cwdVFS2 is the current working directory. + cwdVFS2 vfs.VirtualDentry + // umask is the current file mode creation mask. When a thread using this // context invokes a syscall that creates a file, bits set in umask are // removed from the permissions that the file is created with. @@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { return &f } +// NewFSContextVFS2 returns a new filesystem context. +func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + rootVFS2: root, + cwdVFS2: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + // destroy is the destructor for an FSContext. // // This will call DecRef on both root and cwd Dirents. If either call to @@ -75,11 +95,17 @@ func (f *FSContext) destroy() { f.mu.Lock() defer f.mu.Unlock() - f.root.DecRef() - f.root = nil - - f.cwd.DecRef() - f.cwd = nil + if VFS2Enabled { + f.rootVFS2.DecRef() + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef() + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef() + f.root = nil + f.cwd.DecRef() + f.cwd = nil + } } // DecRef implements RefCounter.DecRef with destructor f.destroy. @@ -93,12 +119,21 @@ func (f *FSContext) DecRef() { func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() - f.root.IncRef() + + if VFS2Enabled { + f.cwdVFS2.IncRef() + f.rootVFS2.IncRef() + } else { + f.cwd.IncRef() + f.root.IncRef() + } + return &FSContext{ - cwd: f.cwd, - root: f.root, - umask: f.umask, + cwd: f.cwd, + root: f.root, + cwdVFS2: f.cwdVFS2, + rootVFS2: f.rootVFS2, + umask: f.umask, } } @@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext { func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - if f.cwd != nil { - f.cwd.IncRef() - } + + f.cwd.IncRef() return f.cwd } +// WorkingDirectoryVFS2 returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwdVFS2.IncRef() + return f.cwdVFS2 +} + // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // @@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old.DecRef() } +// SetWorkingDirectoryVFS2 sets the current working directory. +// This will take an extra reference on the VirtualDentry. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { + f.mu.Lock() + defer f.mu.Unlock() + + old := f.cwdVFS2 + f.cwdVFS2 = d + d.IncRef() + old.DecRef() +} + // RootDirectory returns the current filesystem root. // // This will return nil if called after destroy(), otherwise it will return a @@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent { return f.root } +// RootDirectoryVFS2 returns the current filesystem root. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.rootVFS2.IncRef() + return f.rootVFS2 +} + // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // @@ -172,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old.DecRef() } +// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { + if !vd.Ok() { + panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") + } + + f.mu.Lock() + + if !f.rootVFS2.Ok() { + f.mu.Unlock() + panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd)) + } + + old := f.rootVFS2 + vd.IncRef() + f.rootVFS2 = vd + f.mu.Unlock() + old.DecRef() +} + // Umask returns the current umask. func (f *FSContext) Umask() uint { f.mu.Lock() diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index dcd6e91c4..1d627564f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -43,11 +43,13 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -71,6 +73,10 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // @@ -105,7 +111,7 @@ type Kernel struct { timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace - networkStack inet.Stack `state:"nosave"` + rootNetworkNamespace *inet.Namespace applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry @@ -235,6 +241,16 @@ type Kernel struct { // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` + + // SpecialOpts contains special kernel options. + SpecialOpts + + // VFS keeps the filesystem state used across the kernel. + vfs vfs.VirtualFilesystem + + // If set to true, report address space activation waits as if the task is in + // external wait so that the watchdog doesn't report the task stuck. + SleepForAddressSpaceActivation bool } // InitKernelArgs holds arguments to Init. @@ -248,8 +264,9 @@ type InitKernelArgs struct { // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace - // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. - NetworkStack inet.Stack + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. + RootNetworkNamespace *inet.Namespace // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus @@ -308,7 +325,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace - k.networkStack = args.NetworkStack + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true @@ -531,8 +551,6 @@ func (ts *TaskSet) unregisterEpollWaiters() { func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() - k.networkStack = net - initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. @@ -563,6 +581,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + // Load the memory file's state. memoryStart := time.Now() if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { @@ -621,7 +643,7 @@ type CreateProcessArgs struct { // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". - File *fs.File + File fsbridge.File // Argvv is a list of arguments. Argv []string @@ -670,6 +692,13 @@ type CreateProcessArgs struct { // increment it). MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + // ContainerID is the container that the process belongs to. ContainerID string } @@ -708,13 +737,26 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.Credentials case fs.CtxRoot: if ctx.args.MountNamespace != nil { - // MountNamespace.Root() will take a reference on the root - // dirent for us. + // MountNamespace.Root() will take a reference on the root dirent for us. return ctx.args.MountNamespace.Root() } return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. + return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -754,34 +796,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) - // Grab the mount namespace. - mounts := args.MountNamespace - if mounts == nil { - mounts = k.GlobalInit().Leader().MountNamespace() - mounts.IncRef() - } - - tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) - // Get the root directory from the MountNamespace. - root := mounts.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. - defer root.DecRef() - - // Grab the working directory. - remainingTraversals := uint(args.MaxSymlinkTraversals) - wd := root // Default. - if args.WorkingDirectory != "" { - var err error - wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) - if err != nil { - return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() } - defer wd.DecRef() + // Get the root directory from the MountNamespace. + root := args.MountNamespaceVFS2.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() + } + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) } + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + // Check which file to start from. switch { case args.Filename != "": @@ -802,11 +887,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } // Create a fresh task context. - remainingTraversals = uint(args.MaxSymlinkTraversals) + remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ - Mounts: mounts, - Root: root, - WorkingDirectory: wd, + Opener: opener, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, @@ -831,13 +914,15 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Kernel: k, ThreadGroup: tg, TaskContext: tc, - FSContext: newFSContext(root, wd, args.Umask), + FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) @@ -1097,6 +1182,14 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { k.sendExternalSignal(info, context) } +// SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. +// This function doesn't skip signals like SendExternalSignal does. +func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error { + k.extMu.Lock() + defer k.extMu.Unlock() + return tg.SendSignal(info) +} + // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { @@ -1175,10 +1268,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } -// NetworkStack returns the network stack. NetworkStack may return nil if no -// network stack is available. -func (k *Kernel) NetworkStack() inet.Stack { - return k.networkStack +// RootNetworkNamespace returns the root network namespace, always non-nil. +func (k *Kernel) RootNetworkNamespace() *inet.Namespace { + return k.rootNetworkNamespace } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or @@ -1375,8 +1467,24 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.globalInit.mounts.Root() } return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter + case inet.CtxStack: + return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -1420,3 +1528,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { Registers: t.Arch().StateData().Proto(), }) } + +// VFS returns the virtual filesystem for the kernel. +func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} diff --git a/pkg/sentry/kernel/kernel_opts.go b/pkg/sentry/kernel/kernel_opts.go new file mode 100644 index 000000000..2e66ec587 --- /dev/null +++ b/pkg/sentry/kernel/kernel_opts.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +// SpecialOpts contains non-standard options for the kernel. +// +// +stateify savable +type SpecialOpts struct{} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 4c049d5b4..f29dc0472 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,25 +1,10 @@ load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "buffer_list", - out = "buffer_list.go", - package = "pipe", - prefix = "buffer", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*buffer", - "Linker": "*buffer", - }, -) - go_library( name = "pipe", srcs = [ - "buffer.go", - "buffer_list.go", "device.go", "node.go", "pipe.go", @@ -33,8 +18,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/buffer", "//pkg/context", - "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", @@ -51,7 +36,6 @@ go_test( name = "pipe_test", size = "small", srcs = [ - "buffer_test.go", "node_test.go", "pipe_test.go", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go deleted file mode 100644 index fe3be5dbd..000000000 --- a/pkg/sentry/kernel/pipe/buffer.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pipe - -import ( - "io" - - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sync" -) - -// buffer encapsulates a queueable byte buffer. -// -// Note that the total size is slightly less than two pages. This -// is done intentionally to ensure that the buffer object aligns -// with runtime internals. We have no hard size or alignment -// requirements. This two page size will effectively minimize -// internal fragmentation, but still have a large enough chunk -// to limit excessive segmentation. -// -// +stateify savable -type buffer struct { - data [8144]byte - read int - write int - bufferEntry -} - -// Reset resets internal data. -// -// This must be called before use. -func (b *buffer) Reset() { - b.read = 0 - b.write = 0 -} - -// Empty indicates the buffer is empty. -// -// This indicates there is no data left to read. -func (b *buffer) Empty() bool { - return b.read == b.write -} - -// Full indicates the buffer is full. -// -// This indicates there is no capacity left to write. -func (b *buffer) Full() bool { - return b.write == len(b.data) -} - -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -func (b *buffer) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.write:])) - n, err := safemem.CopySeq(dst, srcs) - b.write += int(n) - return n, err -} - -// WriteFromReader writes to the buffer from an io.Reader. -func (b *buffer) WriteFromReader(r io.Reader, count int64) (int64, error) { - dst := b.data[b.write:] - if count < int64(len(dst)) { - dst = b.data[b.write:][:count] - } - n, err := r.Read(dst) - b.write += n - return int64(n), err -} - -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -func (b *buffer) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(b.data[b.read:b.write])) - n, err := safemem.CopySeq(dsts, src) - b.read += int(n) - return n, err -} - -// ReadToWriter reads from the buffer into an io.Writer. -func (b *buffer) ReadToWriter(w io.Writer, count int64, dup bool) (int64, error) { - src := b.data[b.read:b.write] - if count < int64(len(src)) { - src = b.data[b.read:][:count] - } - n, err := w.Write(src) - if !dup { - b.read += n - } - return int64(n), err -} - -// bufferPool is a pool for buffers. -var bufferPool = sync.Pool{ - New: func() interface{} { - return new(buffer) - }, -} - -// newBuffer grabs a new buffer from the pool. -func newBuffer() *buffer { - b := bufferPool.Get().(*buffer) - b.Reset() - return b -} diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 08410283f..725e9db7d 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -20,6 +20,7 @@ import ( "sync/atomic" "syscall" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" @@ -70,10 +71,10 @@ type Pipe struct { // mu protects all pipe internal state below. mu sync.Mutex `state:"nosave"` - // data is the buffer queue of pipe contents. + // view is the underlying set of buffers. // // This is protected by mu. - data bufferList + view buffer.View // max is the maximum size of the pipe in bytes. When this max has been // reached, writers will get EWOULDBLOCK. @@ -81,11 +82,6 @@ type Pipe struct { // This is protected by mu. max int64 - // size is the current size of the pipe in bytes. - // - // This is protected by mu. - size int64 - // hadWriter indicates if this pipe ever had a writer. Note that this // does not necessarily indicate there is *currently* a writer, just // that there has been a writer at some point since the pipe was @@ -196,7 +192,7 @@ type readOps struct { limit func(int64) // read performs the actual read operation. - read func(*buffer) (int64, error) + read func(*buffer.View) (int64, error) } // read reads data from the pipe into dst and returns the number of bytes @@ -213,7 +209,7 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { defer p.mu.Unlock() // Is the pipe empty? - if p.size == 0 { + if p.view.Size() == 0 { if !p.HasWriters() { // There are no writers, return EOF. return 0, nil @@ -222,71 +218,13 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { } // Limit how much we consume. - if ops.left() > p.size { - ops.limit(p.size) + if ops.left() > p.view.Size() { + ops.limit(p.view.Size()) } - done := int64(0) - for ops.left() > 0 { - // Pop the first buffer. - first := p.data.Front() - if first == nil { - break - } - - // Copy user data. - n, err := ops.read(first) - done += int64(n) - p.size -= n - - // Empty buffer? - if first.Empty() { - // Push to the free list. - p.data.Remove(first) - bufferPool.Put(first) - } - - // Handle errors. - if err != nil { - return done, err - } - } - - return done, nil -} - -// dup duplicates all data from this pipe into the given writer. -// -// There is no blocking behavior implemented here. The writer may propagate -// some blocking error. All the writes must be complete writes. -func (p *Pipe) dup(ctx context.Context, ops readOps) (int64, error) { - p.mu.Lock() - defer p.mu.Unlock() - - // Is the pipe empty? - if p.size == 0 { - if !p.HasWriters() { - // See above. - return 0, nil - } - return 0, syserror.ErrWouldBlock - } - - // Limit how much we consume. - if ops.left() > p.size { - ops.limit(p.size) - } - - done := int64(0) - for buf := p.data.Front(); buf != nil; buf = buf.Next() { - n, err := ops.read(buf) - done += n - if err != nil { - return done, err - } - } - - return done, nil + // Copy user data; the read op is responsible for trimming. + done, err := ops.read(&p.view) + return done, err } type writeOps struct { @@ -297,7 +235,7 @@ type writeOps struct { limit func(int64) // write should write to the provided buffer. - write func(*buffer) (int64, error) + write func(*buffer.View) (int64, error) } // write writes data from sv into the pipe and returns the number of bytes @@ -317,33 +255,19 @@ func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { // POSIX requires that a write smaller than atomicIOBytes (PIPE_BUF) be // atomic, but requires no atomicity for writes larger than this. wanted := ops.left() - if avail := p.max - p.size; wanted > avail { + if avail := p.max - p.view.Size(); wanted > avail { if wanted <= p.atomicIOBytes { return 0, syserror.ErrWouldBlock } ops.limit(avail) } - done := int64(0) - for ops.left() > 0 { - // Need a new buffer? - last := p.data.Back() - if last == nil || last.Full() { - // Add a new buffer to the data list. - last = newBuffer() - p.data.PushBack(last) - } - - // Copy user data. - n, err := ops.write(last) - done += int64(n) - p.size += n - - // Handle errors. - if err != nil { - return done, err - } + // Copy user data. + done, err := ops.write(&p.view) + if err != nil { + return done, err } + if wanted > done { // Partial write due to full pipe. return done, syserror.ErrWouldBlock @@ -396,7 +320,7 @@ func (p *Pipe) HasWriters() bool { // Precondition: mu must be held. func (p *Pipe) rReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasReaders() && p.data.Front() != nil { + if p.HasReaders() && p.view.Size() != 0 { ready |= waiter.EventIn } if !p.HasWriters() && p.hadWriter { @@ -422,7 +346,7 @@ func (p *Pipe) rReadiness() waiter.EventMask { // Precondition: mu must be held. func (p *Pipe) wReadinessLocked() waiter.EventMask { ready := waiter.EventMask(0) - if p.HasWriters() && p.size < p.max { + if p.HasWriters() && p.view.Size() < p.max { ready |= waiter.EventOut } if !p.HasReaders() { @@ -451,7 +375,7 @@ func (p *Pipe) rwReadiness() waiter.EventMask { func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() - return p.size + return p.view.Size() } // FifoSize implements fs.FifoSizer.FifoSize. @@ -474,7 +398,7 @@ func (p *Pipe) SetFifoSize(size int64) (int64, error) { } p.mu.Lock() defer p.mu.Unlock() - if size < p.size { + if size < p.view.Size() { return 0, syserror.EBUSY } p.max = size diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 80158239e..5a1d4fd57 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" @@ -49,9 +50,10 @@ func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) limit: func(l int64) { dst = dst.TakeFirst64(l) }, - read: func(buf *buffer) (int64, error) { - n, err := dst.CopyOutFrom(ctx, buf) + read: func(view *buffer.View) (int64, error) { + n, err := dst.CopyOutFrom(ctx, view) dst = dst.DropFirst64(n) + view.TrimFront(n) return n, err }, }) @@ -70,16 +72,15 @@ func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) limit: func(l int64) { count = l }, - read: func(buf *buffer) (int64, error) { - n, err := buf.ReadToWriter(w, count, dup) + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToWriter(w, count) + if !dup { + view.TrimFront(n) + } count -= n return n, err }, } - if dup { - // There is no notification for dup operations. - return p.dup(ctx, ops) - } n, err := p.read(ctx, ops) if n > 0 { p.Notify(waiter.EventOut) @@ -96,8 +97,8 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) limit: func(l int64) { src = src.TakeFirst64(l) }, - write: func(buf *buffer) (int64, error) { - n, err := src.CopyInTo(ctx, buf) + write: func(view *buffer.View) (int64, error) { + n, err := src.CopyInTo(ctx, view) src = src.DropFirst64(n) return n, err }, @@ -117,8 +118,8 @@ func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, e limit: func(l int64) { count = l }, - write: func(buf *buffer) (int64, error) { - n, err := buf.WriteFromReader(r, count) + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromReader(r, count) count -= n return n, err }, diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index efebfd872..ded95f532 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -303,26 +303,14 @@ func (t *Task) rseqAddrInterrupt() { return } - buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection) - if _, err := t.CopyInBytes(critAddr, buf); err != nil { + var cs linux.RSeqCriticalSection + if err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } - // Manually marshal RSeqCriticalSection as this is in the hot path when - // rseq is enabled. It must be as fast as possible. - // - // TODO(b/130243041): Replace with go_marshal. - cs := linux.RSeqCriticalSection{ - Version: usermem.ByteOrder.Uint32(buf[0:4]), - Flags: usermem.ByteOrder.Uint32(buf[4:8]), - Start: usermem.ByteOrder.Uint64(buf[8:16]), - PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]), - Abort: usermem.ByteOrder.Uint64(buf[24:32]), - } - if cs.Version != 0 { t.Debugf("Unknown version in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 981e8c7fe..c0dbbe890 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -424,6 +425,11 @@ type Task struct { // abstractSockets is protected by mu. abstractSockets *AbstractSocketNamespace + // mountNamespaceVFS2 is the task's mount namespace. + // + // It is protected by mu. It is owned by the task goroutine. + mountNamespaceVFS2 *vfs.MountNamespace + // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. @@ -481,13 +487,10 @@ type Task struct { numaPolicy int32 numaNodeMask uint64 - // If netns is true, the task is in a non-root network namespace. Network - // namespaces aren't currently implemented in full; being in a network - // namespace simply prevents the task from observing any network devices - // (including loopback) or using abstract socket addresses (see unix(7)). + // netns is the task's network namespace. netns is never nil. // - // netns is protected by mu. netns is owned by the task goroutine. - netns bool + // netns is protected by mu. + netns *inet.Namespace // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and @@ -552,6 +555,13 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time + + // oomScoreAdj is the task's OOM score adjustment. This is currently not + // used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is protected by mu, and is owned by the task goroutine. + oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -638,6 +648,11 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsContext.RootDirectory() + case vfs.CtxRoot: + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -701,6 +716,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { + if VFS2Enabled { + realRoot := t.mountNamespaceVFS2.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectoryVFS2() + defer root.DecRef() + return root != realRoot + } + realRoot := t.tg.mounts.Root() defer realRoot.DecRef() root := t.fsContext.RootDirectory() @@ -774,6 +797,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) return fds[0], nil } +// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable.Get. +func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + return t.fdTable.NewFDVFS2(t, fd, file, flags) +} + // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. // // This automatically passes the task as the context. @@ -783,6 +815,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { return t.fdTable.NewFDAt(t, fd, file, flags) } +// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error { + return t.fdTable.NewFDAtVFS2(t, fd, file, flags) +} + // WithMuLocked executes f with t.mu locked. func (t *Task) WithMuLocked(f func(*Task)) { t.mu.Lock() @@ -796,6 +837,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace { return t.tg.mounts } +// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the +// returned mount namespace. +func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { + t.mu.Lock() + defer t.mu.Unlock() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 +} + // AbstractSockets returns t's AbstractSocketNamespace. func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets @@ -805,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace { func (t *Task) ContainerID() string { return t.containerID } + +// OOMScoreAdj gets the task's OOM score adjustment. +func (t *Task) OOMScoreAdj() (int32, error) { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return 0, syserror.ESRCH + } + return t.oomScoreAdj, nil +} + +// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be +// between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return syserror.ESRCH + } + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + t.oomScoreAdj = adj + return nil +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 53d4d211b..dda502bb8 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -54,8 +55,7 @@ type SharingOptions struct { NewUserNamespace bool // If NewNetworkNamespace is true, the task should have an independent - // network namespace. (Note that network namespaces are not really - // implemented; see comment on Task.netns for details.) + // network namespace. NewNetworkNamespace bool // If NewFiles is true, the task should use an independent file descriptor @@ -199,6 +199,17 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } + netns := t.NetworkNamespace() + if opts.NewNetworkNamespace { + netns = inet.NewNamespace(netns) + } + + // TODO(b/63601033): Implement CLONE_NEWNS. + mntnsVFS2 := t.mountNamespaceVFS2 + if mntnsVFS2 != nil { + mntnsVFS2.IncRef() + } + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err @@ -241,7 +252,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqAddr := usermem.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { - tg.mounts.IncRef() + if tg.mounts != nil { + tg.mounts.IncRef() + } sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() @@ -251,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqSignature = t.rseqSignature } + adj, err := t.OOMScoreAdj() + if err != nil { + return 0, nil, err + } + cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -260,23 +278,22 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), - NetworkNamespaced: t.netns, + NetworkNamespace: netns, AllowedCPUMask: t.CPUMask(), UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + MountNamespaceVFS2: mntnsVFS2, RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), + OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t } else { cfg.InheritParent = t } - if opts.NewNetworkNamespace { - cfg.NetworkNamespaced = true - } nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { @@ -473,7 +490,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.mu.Unlock() return syserror.EPERM } - t.netns = true + t.netns = inet.NewNamespace(t.netns) } if opts.NewUTSNamespace { if !haveCapSysAdmin { diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 2d6e7733c..0158b1788 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -136,11 +136,11 @@ func (t *Task) Stack() *arch.Stack { func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { // If File is not nil, we should load that instead of resolving Filename. if args.File != nil { - args.Filename = args.File.MappedName(ctx) + args.Filename = args.File.PathnameWithDeleted(ctx) } // Prepare a new user address space to load into. - m := mm.NewMemoryManager(k, k) + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) defer m.DecUsers(ctx) args.MemoryManager = m diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 8f57a34a6..00c425cca 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -220,7 +220,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.mu.Unlock() t.unstopVforkParent() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. - t.MemoryManager().Activate() + t.MemoryManager().Activate(t) t.ptraceExec(oldTID) return (*runSyscallExit)(nil) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 435761e5a..c4ade6e8e 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState { t.fsContext.DecRef() t.fdTable.DecRef() + t.mu.Lock() + if t.mountNamespaceVFS2 != nil { + t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2 = nil + } + t.mu.Unlock() + // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 41259210c..eeccaa197 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -32,21 +32,21 @@ const ( // Infof logs an formatted info message by calling log.Infof. func (t *Task) Infof(fmt string, v ...interface{}) { if log.IsLogging(log.Info) { - log.Infof(t.logPrefix.Load().(string)+fmt, v...) + log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Warningf logs a warning string by calling log.Warningf. func (t *Task) Warningf(fmt string, v ...interface{}) { if log.IsLogging(log.Warning) { - log.Warningf(t.logPrefix.Load().(string)+fmt, v...) + log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Debugf creates a debug string that includes the task ID. func (t *Task) Debugf(fmt string, v ...interface{}) { if log.IsLogging(log.Debug) { - log.Debugf(t.logPrefix.Load().(string)+fmt, v...) + log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } @@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) { if !trace.IsEnabled() { return } - d := tc.MemoryManager.Executable() - if d == nil { + file := tc.MemoryManager.Executable() + if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer d.DecRef() - root := t.fsContext.RootDirectory() - if root == nil { - trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") - return - } - defer root.DecRef() - n, _ := d.FullName(root) - trace.Logf(t.traceContext, traceCategory, "exec: %s", n) + defer file.DecRef() + trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index 172a31e1d..f7711232c 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -22,14 +22,23 @@ import ( func (t *Task) IsNetworkNamespaced() bool { t.mu.Lock() defer t.mu.Unlock() - return t.netns + return !t.netns.IsRoot() } // NetworkContext returns the network stack used by the task. NetworkContext // may return nil if no network stack is available. +// +// TODO(gvisor.dev/issue/1833): Migrate callers of this method to +// NetworkNamespace(). func (t *Task) NetworkContext() inet.Stack { - if t.IsNetworkNamespaced() { - return nil - } - return t.k.networkStack + t.mu.Lock() + defer t.mu.Unlock() + return t.netns.Stack() +} + +// NetworkNamespace returns the network namespace observed by the task. +func (t *Task) NetworkNamespace() *inet.Namespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 5568c91bc..799cbcd93 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -126,13 +126,39 @@ func (t *Task) doStop() { } } +func (*runApp) handleCPUIDInstruction(t *Task) error { + if len(arch.CPUIDInstruction) == 0 { + // CPUID emulation isn't supported, but this code can be + // executed, because the ptrace platform returns + // ErrContextSignalCPUID on page faults too. Look at + // pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more + // details. + return platform.ErrContextSignal + } + // Is this a CPUID instruction? + region := trace.StartRegion(t.traceContext, cpuidRegion) + expected := arch.CPUIDInstruction[:] + found := make([]byte, len(expected)) + _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + if err == nil && bytes.Equal(expected, found) { + // Skip the cpuid instruction. + t.Arch().CPUIDEmulate(t) + t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) + region.End() + + return nil + } + region.End() // Not an actual CPUID, but required copy-in. + return platform.ErrContextSignal +} + // The runApp state checks for interrupts before executing untrusted // application code. // // +stateify savable type runApp struct{} -func (*runApp) execute(t *Task) taskRunState { +func (app *runApp) execute(t *Task) taskRunState { if t.interrupted() { // Checkpointing instructs tasks to stop by sending an interrupt, so we // must check for stops before entering runInterrupt (instead of @@ -237,21 +263,10 @@ func (*runApp) execute(t *Task) taskRunState { return (*runApp)(nil) case platform.ErrContextSignalCPUID: - // Is this a CPUID instruction? - region := trace.StartRegion(t.traceContext, cpuidRegion) - expected := arch.CPUIDInstruction[:] - found := make([]byte, len(expected)) - _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) - if err == nil && bytes.Equal(expected, found) { - // Skip the cpuid instruction. - t.Arch().CPUIDEmulate(t) - t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) - region.End() - + if err := app.handleCPUIDInstruction(t); err == nil { // Resume execution. return (*runApp)(nil) } - region.End() // Not an actual CPUID, but required copy-in. // The instruction at the given RIP was not a CPUID, and we // fallthrough to the default signal deliver behavior below. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index de838beef..2bbf48bb8 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,10 +17,12 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -64,9 +66,8 @@ type TaskConfig struct { // Niceness is the niceness of the new task. Niceness int - // If NetworkNamespaced is true, the new task should observe a non-root - // network namespace. - NetworkNamespaced bool + // NetworkNamespace is the network namespace to be used for the new task. + NetworkNamespace *inet.Namespace // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet @@ -80,6 +81,9 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + // MountNamespaceVFS2 is the MountNamespace of the new task. + MountNamespaceVFS2 *vfs.MountNamespace + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. RSeqAddr usermem.Addr @@ -89,6 +93,9 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string + + // oomScoreAdj is the task's OOM score adjustment. + OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -116,28 +123,30 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { parent: cfg.Parent, children: make(map[*Task]struct{}), }, - runState: (*runApp)(nil), - interruptChan: make(chan struct{}, 1), - signalMask: cfg.SignalMask, - signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, - fsContext: cfg.FSContext, - fdTable: cfg.FDTable, - p: cfg.Kernel.Platform.NewContext(), - k: cfg.Kernel, - ptraceTracees: make(map[*Task]struct{}), - allowedCPUMask: cfg.AllowedCPUMask.Copy(), - ioUsage: &usage.IO{}, - niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, - utsns: cfg.UTSNamespace, - ipcns: cfg.IPCNamespace, - abstractSockets: cfg.AbstractSocketNamespace, - rseqCPU: -1, - rseqAddr: cfg.RSeqAddr, - rseqSignature: cfg.RSeqSignature, - futexWaiter: futex.NewWaiter(), - containerID: cfg.ContainerID, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespace, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + mountNamespaceVFS2: cfg.MountNamespaceVFS2, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, + oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 2bf3ce8a8..b02044ad2 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -30,7 +30,7 @@ var MAX_RW_COUNT = int(usermem.Addr(math.MaxInt32).RoundDown()) // Activate ensures that the task has an active address space. func (t *Task) Activate() { if mm := t.MemoryManager(); mm != nil { - if err := mm.Activate(); err != nil { + if err := mm.Activate(t); err != nil { panic("unable to activate mm: " + err.Error()) } } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 768e958d2..268f62e9d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -256,7 +256,7 @@ type ThreadGroup struct { tty *TTY } -// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. @@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() { for _, it := range its { it.DestroyTimer() } - tg.mounts.DecRef() + if tg.mounts != nil { + tg.mounts.DecRef() + } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 23790378a..c6aa65f28 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/memmap", @@ -40,6 +41,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 122ed05c2..616fafa2c 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -97,11 +97,11 @@ type elfInfo struct { // accepts from the ELF, and it doesn't parse unnecessary parts of the file. // // ctx may be nil if f does not need it. -func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { +func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. var ident [elf.EI_NIDENT]byte - _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0) if err != nil { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. @@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { var hdr elf.Header64 hdrBuf := make([]byte, header64Size) - _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0) if err != nil { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. @@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { } phdrBuf := make([]byte, totalPhdrSize) - _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) if err != nil { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. @@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. -func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { +func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error { // We must make a page-aligned mapping. adjust := usermem.Addr(phdr.Vaddr).PageOffset() @@ -395,7 +395,7 @@ type loadedELF struct { // // Preconditions: // * f is an ELF file -func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { +func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { first := true var start, end usermem.Addr var interpreter string @@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el } path := make([]byte, phdr.Filesz) - _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off)) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) @@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el // Preconditions: // * f is an ELF file // * f is the first ELF loaded into m -func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { +func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) { info, err := parseHeader(ctx, f) if err != nil { ctx.Infof("Failed to parse initial ELF: %v", err) @@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // // Preconditions: // * f is an ELF file -func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) { +func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { if err == syserror.ENOEXEC { @@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error // Refresh the traversal limit. *args.RemainingTraversals = linux.MaxSymlinkTraversals args.Filename = bin.interpreter - d, i, err := openPath(ctx, args) + intFile, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer i.DecRef() - // We don't need the Dirent. - d.DecRef() + defer intFile.DecRef() - interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin) + interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 098a45d36..3886b4d33 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -19,7 +19,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,9 +37,9 @@ const ( ) // parseInterpreterScript returns the interpreter path and argv. -func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) { +func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) { line := make([]byte, interpMaxLineLength) - n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0) + n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0) // Short read is OK. if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 9a613d6b7..d6675b8f0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -20,7 +20,6 @@ import ( "fmt" "io" "path" - "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" @@ -29,8 +28,10 @@ import ( "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,16 +42,6 @@ type LoadArgs struct { // MemoryManager is the memory manager to load the executable into. MemoryManager *mm.MemoryManager - // Mounts is the mount namespace in which to look up Filename. - Mounts *fs.MountNamespace - - // Root is the root directory under which to look up Filename. - Root *fs.Dirent - - // WorkingDirectory is the working directory under which to look up - // Filename. - WorkingDirectory *fs.Dirent - // RemainingTraversals is the maximum number of symlinks to follow to // resolve Filename. This counter is passed by reference to keep it // updated throughout the call stack. @@ -65,7 +56,12 @@ type LoadArgs struct { // File is an open fs.File object of the executable. If File is not // nil, then File will be loaded and Filename will be ignored. - File *fs.File + // + // The caller is responsible for checking that the user can execute this file. + File fsbridge.File + + // Opener is used to open the executable file when 'File' is nil. + Opener fsbridge.Lookup // CloseOnExec indicates that the executable (or one of its parent // directories) was opened with O_CLOEXEC. If the executable is an @@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // installed in the Task FDTable. The caller takes ownership of both. // // args.Filename must be a readable, executable, regular file. -func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) { +func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") - return nil, nil, syserror.ENOENT - } - - var d *fs.Dirent - var err error - if args.ResolveFinal { - d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } else { - d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer d.DecRef() - - if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) { - return nil, nil, syserror.ELOOP - } - - if err := checkPermission(ctx, d); err != nil { - return nil, nil, err - } - - // If they claim it's a directory, then make sure. - // - // N.B. we reject directories below, but we must first reject - // non-directories passed as directories. - if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) { - return nil, nil, syserror.ENOTDIR - } - - if err := checkIsRegularFile(ctx, d, args.Filename); err != nil { - return nil, nil, err - } - - f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer f.DecRef() - - if err := checkPread(ctx, f, args.Filename); err != nil { - return nil, nil, err - } - - d.IncRef() - f.IncRef() - return d, f, err -} - -// checkFile performs checks on a file to be executed. -func checkFile(ctx context.Context, f *fs.File, filename string) error { - if err := checkPermission(ctx, f.Dirent); err != nil { - return err - } - - if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil { - return err + return nil, syserror.ENOENT } - return checkPread(ctx, f, filename) -} - -// checkPermission checks whether the file is readable and executable. -func checkPermission(ctx context.Context, d *fs.Dirent) error { - perms := fs.PermMask{ - // TODO(gvisor.dev/issue/160): Linux requires only execute - // permission, not read. However, our backing filesystems may - // prevent us from reading the file without read permission. - // - // Additionally, a task with a non-readable executable has - // additional constraints on access via ptrace and procfs. - Read: true, - Execute: true, + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + opts := vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, } - return d.Inode.CheckPermission(ctx, perms) + return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal) } // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc. -func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error { - attr := d.Inode.StableAttr - if !fs.IsRegular(attr) { - ctx.Infof("%s is not regular: %v", filename, attr) - return syserror.EACCES +func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error { + t, err := file.Type(ctx) + if err != nil { + return err } - return nil -} - -// checkPread checks whether we can read the file at arbitrary offsets. -func checkPread(ctx context.Context, f *fs.File, filename string) error { - if !f.Flags().Pread { - ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags()) + if t != linux.ModeRegular { + ctx.Infof("%q is not a regular file: %v", filename, t) return syserror.EACCES } return nil @@ -224,8 +149,10 @@ const ( maxLoaderAttempts = 6 ) -// loadExecutable loads an executable that is pointed to by args.File. If nil, -// the path args.Filename is resolved and loaded. If the executable is an +// loadExecutable loads an executable that is pointed to by args.File. The +// caller is responsible for checking that the user can execute this file. +// If nil, the path args.Filename is resolved and loaded (check that the user +// can execute this file is done here in this case). If the executable is an // interpreter script rather than an ELF, the binary of the corresponding // interpreter will be loaded. // @@ -234,37 +161,27 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated args.Argv -func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - var ( - d *fs.Dirent - err error - ) if args.File == nil { - d, args.File, err = openPath(ctx, args) - // We will return d in the successful case, but defer a DecRef for the - // sake of intermediate loops and failure cases. - if d != nil { - defer d.DecRef() - } - if args.File != nil { - defer args.File.DecRef() + var err error + args.File, err = openPath(ctx, args) + if err != nil { + ctx.Infof("Error opening %s: %v", args.Filename, err) + return loadedELF{}, nil, nil, nil, err } + // Ensure file is release in case the code loops or errors out. + defer args.File.DecRef() } else { - d = args.File.Dirent - d.IncRef() - defer d.DecRef() - err = checkFile(ctx, args.File, args.Filename) - } - if err != nil { - ctx.Infof("Error opening %s: %v", args.Filename, err) - return loadedELF{}, nil, nil, nil, err + if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { + return loadedELF{}, nil, nil, nil, err + } } // Check the header. Is this an ELF or interpreter script? var hdr [4]uint8 // N.B. We assume that reading from a regular file cannot block. - _, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0) + _, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0) // Allow unexpected EOF, as a valid executable could be only three bytes // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { @@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err } - // An ELF is always terminal. Hold on to d. - d.IncRef() - return loaded, ac, d, args.Argv, err + // An ELF is always terminal. Hold on to file. + args.File.IncRef() + return loaded, ac, args.File, args.Argv, err + case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { return loadedELF{}, nil, nil, nil, syserror.ENOENT @@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context } // Refresh the traversal limit for the interpreter. *args.RemainingTraversals = linux.MaxSymlinkTraversals + default: ctx.Infof("Unknown magic: %v", hdr) return loadedELF{}, nil, nil, nil, syserror.ENOEXEC @@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // * Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the executable itself. - loaded, ac, d, newArgv, err := loadExecutable(ctx, args) + loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer d.DecRef() + defer file.DecRef() // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(d) + m.SetExecutable(file) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 52f446ed7..161b28c2c 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -69,6 +70,8 @@ type byteReader struct { var _ fs.FileOperations = (*byteReader)(nil) // newByteReaderFile creates a fake file to read data from. +// +// TODO(gvisor.dev/issue/1623): Convert to VFS2. func newByteReaderFile(ctx context.Context, data []byte) *fs.File { // Create a fake inode. inode := fs.NewInode( @@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq // * PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. -func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) { +func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) @@ -221,7 +224,7 @@ type VDSO struct { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) { - vdsoFile := newByteReaderFile(ctx, vdsoBin) + vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin)) // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index e5729ced5..73591dab7 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -105,8 +105,8 @@ go_library( "//pkg/safecopy", "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/mm/README.md b/pkg/sentry/mm/README.md index e1322e373..f4d43d927 100644 --- a/pkg/sentry/mm/README.md +++ b/pkg/sentry/mm/README.md @@ -274,7 +274,7 @@ In the sentry: methods [`platform.AddressSpace.MapFile` and `platform.AddressSpace.Unmap`][platform]. -[memmap]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/memmap/memmap.go -[mm]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/mm/mm.go -[pgalloc]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/pgalloc/pgalloc.go -[platform]: https://github.com/google/gvisor/blob/master/+/master/pkg/sentry/platform/platform.go +[memmap]: https://github.com/google/gvisor/blob/master/pkg/sentry/memmap/memmap.go +[mm]: https://github.com/google/gvisor/blob/master/pkg/sentry/mm/mm.go +[pgalloc]: https://github.com/google/gvisor/blob/master/pkg/sentry/pgalloc/pgalloc.go +[platform]: https://github.com/google/gvisor/blob/master/pkg/sentry/platform/platform.go diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e58a63deb..0332fc71c 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -18,7 +18,7 @@ import ( "fmt" "sync/atomic" - "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -39,11 +39,18 @@ func (mm *MemoryManager) AddressSpace() platform.AddressSpace { // // When this MemoryManager is no longer needed by a task, it should call // Deactivate to release the reference. -func (mm *MemoryManager) Activate() error { +func (mm *MemoryManager) Activate(ctx context.Context) error { // Fast path: the MemoryManager already has an active // platform.AddressSpace, and we just need to indicate that we need it too. - if atomicbitops.IncUnlessZeroInt32(&mm.active) { - return nil + for { + active := atomic.LoadInt32(&mm.active) + if active == 0 { + // Fall back to the slow path. + break + } + if atomic.CompareAndSwapInt32(&mm.active, active, active+1) { + return nil + } } for { @@ -85,16 +92,20 @@ func (mm *MemoryManager) Activate() error { if as == nil { // AddressSpace is unavailable, we must wait. // - // activeMu must not be held while waiting, as the user - // of the address space we are waiting on may attempt - // to take activeMu. - // - // Don't call UninterruptibleSleepStart to register the - // wait to allow the watchdog stuck task to trigger in - // case a process is starved waiting for the address - // space. + // activeMu must not be held while waiting, as the user of the address + // space we are waiting on may attempt to take activeMu. mm.activeMu.Unlock() + + sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation + if sleep { + // Mark this task sleeping while waiting for the address space to + // prevent the watchdog from reporting it as a stuck task. + ctx.UninterruptibleSleepStart(false) + } <-c + if sleep { + ctx.UninterruptibleSleepFinish(false) + } continue } @@ -118,8 +129,15 @@ func (mm *MemoryManager) Activate() error { func (mm *MemoryManager) Deactivate() { // Fast path: this is not the last goroutine to deactivate the // MemoryManager. - if atomicbitops.DecUnlessOneInt32(&mm.active) { - return + for { + active := atomic.LoadInt32(&mm.active) + if active == 1 { + // Fall back to the slow path. + break + } + if atomic.CompareAndSwapInt32(&mm.active, active, active-1) { + return + } } mm.activeMu.Lock() diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 47b8fbf43..d8a5b9d29 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -18,7 +18,6 @@ import ( "fmt" "sync/atomic" - "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -29,16 +28,17 @@ import ( ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. -func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager { +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager { return &MemoryManager{ - p: p, - mfp: mfp, - haveASIO: p.SupportsAddressSpaceIO(), - privateRefs: &privateRefs{}, - users: 1, - auxv: arch.Auxv{}, - dumpability: UserDumpable, - aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + p: p, + mfp: mfp, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + dumpability: UserDumpable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + sleepForActivation: sleepForActivation, } } @@ -80,9 +80,10 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { envv: mm.envv, auxv: append(arch.Auxv(nil), mm.auxv...), // IncRef'd below, once we know that there isn't an error. - executable: mm.executable, - dumpability: mm.dumpability, - aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + executable: mm.executable, + dumpability: mm.dumpability, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + sleepForActivation: mm.sleepForActivation, } // Copy vmas. @@ -229,7 +230,15 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // IncUsers increments mm's user count and returns true. If the user count is // already 0, IncUsers does nothing and returns false. func (mm *MemoryManager) IncUsers() bool { - return atomicbitops.IncUnlessZeroInt32(&mm.users) + for { + users := atomic.LoadInt32(&mm.users) + if users == 0 { + return false + } + if atomic.CompareAndSwapInt32(&mm.users, users, users+1) { + return true + } + } } // DecUsers decrements mm's user count. If the user count reaches 0, all diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index f550acae0..6a49334f4 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -16,7 +16,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" ) @@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { // // An additional reference will be taken in the case of a non-nil executable, // which must be released by the caller. -func (mm *MemoryManager) Executable() *fs.Dirent { +func (mm *MemoryManager) Executable() fsbridge.File { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() @@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { +func (mm *MemoryManager) SetExecutable(file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. - d.IncRef() + file.IncRef() // Set the executable. orig := mm.executable - mm.executable = d + mm.executable = file mm.metadataMu.Unlock() diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 09e582dd3..c2195ae11 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -37,7 +37,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -215,7 +215,7 @@ type MemoryManager struct { // is not nil, it holds a reference on the Dirent. // // executable is protected by metadataMu. - executable *fs.Dirent + executable fsbridge.File // dumpability describes if and how this MemoryManager may be dumped to // userspace. @@ -226,6 +226,11 @@ type MemoryManager struct { // aioManager keeps track of AIOContexts used for async IOs. AIOManager // must be cloned when CLONE_VM is used. aioManager aioManager + + // sleepForActivation indicates whether the task should report to be sleeping + // before trying to activate the address space. When set to true, delays in + // activation are not reported as stuck tasks by the watchdog. + sleepForActivation bool } // vma represents a virtual memory area. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index edacca741..fdc308542 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -31,7 +31,7 @@ import ( func testMemoryManager(ctx context.Context) *MemoryManager { p := platform.FromContext(ctx) mfp := pgalloc.MemoryFileProviderFromContext(ctx) - mm := NewMemoryManager(p, mfp) + mm := NewMemoryManager(p, mfp, false) mm.layout = arch.MmapLayout{ MinAddr: p.MinUserAddress(), MaxAddr: p.MaxUserAddress(), diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 35cd55fef..4b23f7803 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -81,12 +81,6 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { // Save the death message, which will be thrown. c.dieState.message = msg - // Reload all registers to have an accurate stack trace when we return - // to host mode. This means that the stack should be unwound correctly. - if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 { - throw(msg) - } - // Setup the trampoline. dieArchSetup(c, context, &c.dieState.guestRegs) } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index a63a6a071..99cac665d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -31,6 +31,12 @@ import ( // //go:nosplit func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { + // Reload all registers to have an accurate stack trace when we return + // to host mode. This means that the stack should be unwound correctly. + if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 { + throw(c.dieState.message) + } + // If the vCPU is in user mode, we set the stack to the stored stack // value in the vCPU itself. We don't want to unwind the user stack. if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet { diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s index c61700892..04efa0147 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.s +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -82,6 +82,8 @@ fallback: // dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation. TEXT ·dieTrampoline(SB),NOSPLIT,$0 - // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. - MOVD R9, 8(RSP) - BL ·dieHandler(SB) + // R0: Fake the old PC as caller + // R1: First argument (vCPU) + MOVD.P R1, 8(RSP) // R1: First argument (vCPU) + MOVD.P R0, 8(RSP) // R0: Fake the old PC as caller + B ·dieHandler(SB) diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index af093c6ec..4ca2b7717 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -20,6 +20,7 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) // fpsimdPtr returns a fpsimd64 for the given address. @@ -29,9 +30,29 @@ func fpsimdPtr(addr *byte) *arch.FpsimdContext { return (*arch.FpsimdContext)(unsafe.Pointer(addr)) } +// dieArchSetup initialies the state for dieTrampoline. +// +// The arm64 dieTrampoline requires the vCPU to be set in R1, and the last PC +// to be in R0. The trampoline then simulates a call to dieHandler from the +// provided PC. +// //go:nosplit func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { - // TODO(gvisor.dev/issue/1249): dieTrampoline supporting for Arm64. + // If the vCPU is in user mode, we set the stack to the stored stack + // value in the vCPU itself. We don't want to unwind the user stack. + if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t { + regs := c.CPU.Registers() + context.Regs[0] = regs.Regs[0] + context.Sp = regs.Sp + context.Regs[29] = regs.Regs[29] // stack base address + } else { + context.Regs[0] = guestRegs.Regs.Pc + context.Sp = guestRegs.Regs.Sp + context.Regs[29] = guestRegs.Regs.Regs[29] + context.Pstate = guestRegs.Regs.Pstate + } + context.Regs[1] = uint64(uintptr(unsafe.Pointer(c))) + context.Pc = uint64(dieTrampolineAddr) } // bluepillArchFpContext returns the arch-specific fpsimd context. diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index 972ba85c3..a9b4af43e 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -27,6 +27,38 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. +type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + // KVM represents a lightweight VM context. type KVM struct { platform.NoCPUPreemptionDetection diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go index c5a6f9c7d..093497bc4 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64.go +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -21,17 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0" ) -// userMemoryRegion is a region of physical memory. -// -// This mirrors kvm_memory_region. -type userMemoryRegion struct { - slot uint32 - flags uint32 - guestPhysAddr uint64 - memorySize uint64 - userspaceAddr uint64 -} - // userRegs represents KVM user registers. // // This mirrors kvm_regs. @@ -169,27 +158,6 @@ type modelControlRegisters struct { entries [16]modelControlRegister } -// runData is the run structure. This may be mapped for synchronous register -// access (although that doesn't appear to be supported by my kernel at least). -// -// This mirrors kvm_run. -type runData struct { - requestInterruptWindow uint8 - _ [7]uint8 - - exitReason uint32 - readyForInterruptInjection uint8 - ifFlag uint8 - _ [2]uint8 - - cr8 uint64 - apicBase uint64 - - // This is the union data for exits. Interpretation depends entirely on - // the exitReason above (see vCPU code for more information). - data [32]uint64 -} - // cpuidEntry is a single CPUID entry. // // This mirrors kvm_cpuid_entry2. diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index 2319c86d3..79045651e 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -20,17 +20,6 @@ import ( "syscall" ) -// userMemoryRegion is a region of physical memory. -// -// This mirrors kvm_memory_region. -type userMemoryRegion struct { - slot uint32 - flags uint32 - guestPhysAddr uint64 - memorySize uint64 - userspaceAddr uint64 -} - type kvmOneReg struct { id uint64 addr uint64 @@ -53,27 +42,6 @@ type userRegs struct { fpRegs userFpsimdState } -// runData is the run structure. This may be mapped for synchronous register -// access (although that doesn't appear to be supported by my kernel at least). -// -// This mirrors kvm_run. -type runData struct { - requestInterruptWindow uint8 - _ [7]uint8 - - exitReason uint32 - readyForInterruptInjection uint8 - ifFlag uint8 - _ [2]uint8 - - cr8 uint64 - apicBase uint64 - - // This is the union data for exits. Interpretation depends entirely on - // the exitReason above (see vCPU code for more information). - data [32]uint64 -} - // updateGlobalOnce does global initialization. It has to be called only once. func updateGlobalOnce(fd int) error { physicalInit() diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 8076c7529..f1afc74dc 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -329,10 +329,12 @@ func (m *machine) Destroy() { } // Get gets an available vCPU. +// +// This will return with the OS thread locked. func (m *machine) Get() *vCPU { + m.mu.RLock() runtime.LockOSThread() tid := procid.Current() - m.mu.RLock() // Check for an exact match. if c := m.vCPUs[tid]; c != nil { @@ -343,8 +345,22 @@ func (m *machine) Get() *vCPU { // The happy path failed. We now proceed to acquire an exclusive lock // (because the vCPU map may change), and scan all available vCPUs. + // In this case, we first unlock the OS thread. Otherwise, if mu is + // not available, the current system thread will be parked and a new + // system thread spawned. We avoid this situation by simply refreshing + // tid after relocking the system thread. m.mu.RUnlock() + runtime.UnlockOSThread() m.mu.Lock() + runtime.LockOSThread() + tid = procid.Current() + + // Recheck for an exact match. + if c := m.vCPUs[tid]; c != nil { + c.lock() + m.mu.Unlock() + return c + } for { // Scan for an available vCPU. diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index e99798c56..cd74945e7 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -21,6 +21,7 @@ import ( "strings" "syscall" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -183,13 +184,76 @@ func enableCpuidFault() { // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. // Ref attachedThread() for more detail. -func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { - return append(rules, seccomp.RuleSet{ - Rules: seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, +func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { + rules = append(rules, + // Rules for trapping vsyscall access. + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_GETTIMEOFDAY: {}, + syscall.SYS_TIME: {}, + unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64. }, - }, - Action: linux.SECCOMP_RET_ALLOW, - }) + Action: linux.SECCOMP_RET_TRAP, + Vsyscall: true, + }) + if defaultAction != linux.SECCOMP_RET_ALLOW { + rules = append(rules, + seccomp.RuleSet{ + Rules: seccomp.SyscallRules{ + syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ + {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + }, + }, + Action: linux.SECCOMP_RET_ALLOW, + }) + } + return rules +} + +// probeSeccomp returns true iff seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. This check is dynamic +// because kernels have be backported behavior. +// +// See createStub for more information. +// +// Precondition: the runtime OS thread must be locked. +func probeSeccomp() bool { + // Create a completely new, destroyable process. + t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) + if err != nil { + panic(fmt.Sprintf("seccomp probe failed: %v", err)) + } + defer t.destroy() + + // Set registers to the yield system call. This call is not allowed + // by the filters specified in the attachThread function. + regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) + if err := t.setRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace set regs failed: %v", err)) + } + + for { + // Attempt an emulation. + if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { + panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) + } + + sig := t.wait(stopped) + if sig == (syscallEvent | syscall.SIGTRAP) { + // Did the seccomp errno hook already run? This would + // indicate that seccomp is first in line and we're + // less than 4.8. + if err := t.getRegs(®s); err != nil { + panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) + } + if _, err := syscallReturnValue(®s); err == nil { + // The seccomp errno mode ran first, and reset + // the error in the registers. + return false + } + // The seccomp hook did not run yet, and therefore it + // is safe to use RET_KILL mode for dispatched calls. + return true + } + } } diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index 7b975137f..7f5c393f0 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -160,6 +160,15 @@ func enableCpuidFault() { // appendArchSeccompRules append architecture specific seccomp rules when creating BPF program. // Ref attachedThread() for more detail. -func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet { +func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFAction) []seccomp.RuleSet { return rules } + +// probeSeccomp returns true if seccomp is run after ptrace notifications, +// which is generally the case for kernel version >= 4.8. +// +// On arm64, the support of PTRACE_SYSEMU was added in the 5.3 kernel, so +// probeSeccomp can always return true. +func probeSeccomp() bool { + return true +} diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 74968dfdf..2ce528601 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -20,7 +20,6 @@ import ( "fmt" "syscall" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" @@ -30,54 +29,6 @@ import ( const syscallEvent syscall.Signal = 0x80 -// probeSeccomp returns true iff seccomp is run after ptrace notifications, -// which is generally the case for kernel version >= 4.8. This check is dynamic -// because kernels have be backported behavior. -// -// See createStub for more information. -// -// Precondition: the runtime OS thread must be locked. -func probeSeccomp() bool { - // Create a completely new, destroyable process. - t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO) - if err != nil { - panic(fmt.Sprintf("seccomp probe failed: %v", err)) - } - defer t.destroy() - - // Set registers to the yield system call. This call is not allowed - // by the filters specified in the attachThread function. - regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD) - if err := t.setRegs(®s); err != nil { - panic(fmt.Sprintf("ptrace set regs failed: %v", err)) - } - - for { - // Attempt an emulation. - if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 { - panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno)) - } - - sig := t.wait(stopped) - if sig == (syscallEvent | syscall.SIGTRAP) { - // Did the seccomp errno hook already run? This would - // indicate that seccomp is first in line and we're - // less than 4.8. - if err := t.getRegs(®s); err != nil { - panic(fmt.Sprintf("ptrace get-regs failed: %v", err)) - } - if _, err := syscallReturnValue(®s); err == nil { - // The seccomp errno mode ran first, and reset - // the error in the registers. - return false - } - // The seccomp hook did not run yet, and therefore it - // is safe to use RET_KILL mode for dispatched calls. - return true - } - } -} - // createStub creates a fresh stub processes. // // Precondition: the runtime OS thread must be locked. @@ -123,18 +74,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // stub and all its children. This is used to create child stubs // (below), so we must include the ability to fork, but otherwise lock // down available calls only to what is needed. - rules := []seccomp.RuleSet{ - // Rules for trapping vsyscall access. - { - Rules: seccomp.SyscallRules{ - syscall.SYS_GETTIMEOFDAY: {}, - syscall.SYS_TIME: {}, - unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64. - }, - Action: linux.SECCOMP_RET_TRAP, - Vsyscall: true, - }, - } + rules := []seccomp.RuleSet{} if defaultAction != linux.SECCOMP_RET_ALLOW { rules = append(rules, seccomp.RuleSet{ Rules: seccomp.SyscallRules{ @@ -173,9 +113,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro }, Action: linux.SECCOMP_RET_ALLOW, }) - - rules = appendArchSeccompRules(rules) } + rules = appendArchSeccompRules(rules, defaultAction) instrs, err := seccomp.BuildProgram(rules, defaultAction) if err != nil { return nil, err diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 6b078cd1e..8122ac6e2 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -27,26 +27,27 @@ const ( _PTE_PGT_BASE = 0x7000 _PTE_PGT_SIZE = 0x1000 - _PSR_MODE_EL0t = 0x0 - _PSR_MODE_EL1t = 0x4 - _PSR_MODE_EL1h = 0x5 - _PSR_EL_MASK = 0xf - - _PSR_D_BIT = 0x200 - _PSR_A_BIT = 0x100 - _PSR_I_BIT = 0x80 - _PSR_F_BIT = 0x40 + _PSR_D_BIT = 0x00000200 + _PSR_A_BIT = 0x00000100 + _PSR_I_BIT = 0x00000080 + _PSR_F_BIT = 0x00000040 ) const ( + // PSR bits + PSR_MODE_EL0t = 0x00000000 + PSR_MODE_EL1t = 0x00000004 + PSR_MODE_EL1h = 0x00000005 + PSR_MODE_MASK = 0x0000000f + // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _PSR_MODE_EL1h + KernelFlagsSet = PSR_MODE_EL1h // UserFlagsSet are always set in userspace. - UserFlagsSet = _PSR_MODE_EL0t + UserFlagsSet = PSR_MODE_EL0t - KernelFlagsClear = _PSR_EL_MASK - UserFlagsClear = _PSR_EL_MASK + KernelFlagsClear = PSR_MODE_MASK + UserFlagsClear = PSR_MODE_MASK PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT ) @@ -88,14 +89,14 @@ const ( El0Sync_undef El0Sync_dbg El0Sync_inv - VirtualizationException _NR_INTERRUPTS ) // System call vectors. const ( - Syscall Vector = El0Sync_svc - PageFault Vector = El0Sync_da + Syscall Vector = El0Sync_svc + PageFault Vector = El0Sync_da + VirtualizationException Vector = El0Error ) // VirtualAddressBits returns the number bits available for virtual addresses. diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index 679842288..d42eda37b 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -25,10 +25,14 @@ // not available for calls. // +// ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ WORD $0xd69f03e0 +// RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM + +// RSV_REG_APP is a register that holds el0 information temporarily. #define RSV_REG_APP R9 #define FPEN_NOTRAP 0x3 @@ -36,6 +40,12 @@ #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) +// Saves a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// +// The following registers are not saved: R9, R18. #define REGISTERS_SAVE(reg, offset) \ MOVD R0, offset+PTRACE_R0(reg); \ MOVD R1, offset+PTRACE_R1(reg); \ @@ -67,6 +77,12 @@ MOVD R29, offset+PTRACE_R29(reg); \ MOVD R30, offset+PTRACE_R30(reg); +// Loads a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// +// The following registers are not loaded: R9, R18. #define REGISTERS_LOAD(reg, offset) \ MOVD offset+PTRACE_R0(reg), R0; \ MOVD offset+PTRACE_R1(reg), R1; \ @@ -98,7 +114,7 @@ MOVD offset+PTRACE_R29(reg), R29; \ MOVD offset+PTRACE_R30(reg), R30; -//NOP +// NOP-s #define nop31Instructions() \ WORD $0xd503201f; \ WORD $0xd503201f; \ @@ -254,6 +270,7 @@ #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) +// LOAD_KERNEL_ADDRESS loads a kernel address. #define LOAD_KERNEL_ADDRESS(from, to) \ MOVD from, to; \ ORR $0xffff000000000000, to, to; @@ -263,15 +280,18 @@ LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ MOVD RSV_REG, RSP; \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 ISB $15; \ DSB $15; +// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. #define SWITCH_TO_APP_PAGETABLE(from) \ MOVD CPU_TTBR0_APP(from), RSV_REG; \ WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 ISB $15; \ DSB $15; +// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. #define SWITCH_TO_KVM_PAGETABLE(from) \ MOVD CPU_TTBR0_KVM(from), RSV_REG; \ WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 @@ -294,6 +314,7 @@ WORD $0xd5181040; \ //MSR R0, CPACR_EL1 ISB $15; +// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1. #define KERNEL_ENTRY_FROM_EL0 \ SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ @@ -315,19 +336,22 @@ WORD $0xd5384103; \ // MRS SP_EL0, R3 MOVD R3, PTRACE_SP(RSV_REG_APP); +// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1. #define KERNEL_ENTRY_FROM_EL1 \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // save sentry context + REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context. MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \ WORD $0xd5384004; \ // MRS SPSR_EL1, R4 MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ MRS ELR_EL1, R4; \ MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \ MOVD RSP, R4; \ - MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); + MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ + LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. +// Halt halts execution. TEXT ·Halt(SB),NOSPLIT,$0 - // clear bluepill. + // Clear bluepill. WORD $0xd538d092 //MRS TPIDR_EL1, R18 CMP RSV_REG, R9 BNE mmio_exit @@ -341,8 +365,22 @@ mmio_exit: // MMIO_EXIT. MOVD $0, R9 MOVD R0, 0xffff000000001000(R9) - B ·kernelExitToEl1(SB) + RET + +// HaltAndResume halts execution and point the pointer to the resume function. +TEXT ·HaltAndResume(SB),NOSPLIT,$0 + BL ·Halt(SB) + B ·kernelExitToEl1(SB) // Resume. +// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. +TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 + WORD $0xd538d092 // MRS TPIDR_EL1, R18 + MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. + MOVD R3, 8(RSP) // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + B ·kernelExitToEl1(SB) // Resume. + +// Shutdown stops the guest. TEXT ·Shutdown(SB),NOSPLIT,$0 // PSCI EVENT. MOVD $0x84000009, R0 @@ -429,6 +467,7 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 ERET() +// Start is the CPU entrypoint. TEXT ·Start(SB),NOSPLIT,$0 IRQ_DISABLE MOVD R8, RSV_REG @@ -437,18 +476,23 @@ TEXT ·Start(SB),NOSPLIT,$0 B ·kernelExitToEl1(SB) +// El1_sync_invalid is the handler for an invalid EL1_sync. TEXT ·El1_sync_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_irq_invalid is the handler for an invalid El1_irq. TEXT ·El1_irq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_fiq_invalid is the handler for an invalid El1_fiq. TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_error_invalid is the handler for an invalid El1_error. TEXT ·El1_error_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_sync is the handler for El1_sync. TEXT ·El1_sync(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL1 WORD $0xd5385219 // MRS ESR_EL1, R25 @@ -484,10 +528,10 @@ el1_da: MOVD $PageFault, R3 MOVD R3, CPU_VECTOR_CODE(RSV_REG) - B ·Halt(SB) + B ·HaltAndResume(SB) el1_ia: - B ·Halt(SB) + B ·HaltAndResume(SB) el1_sp_pc: B ·Shutdown(SB) @@ -496,7 +540,9 @@ el1_undef: B ·Shutdown(SB) el1_svc: - B ·Halt(SB) + MOVD $0, CPU_ERROR_CODE(RSV_REG) + MOVD $0, CPU_ERROR_TYPE(RSV_REG) + B ·HaltEl1SvcAndResume(SB) el1_dbg: B ·Shutdown(SB) @@ -508,15 +554,19 @@ el1_fpsimd_acc: el1_invalid: B ·Shutdown(SB) +// El1_irq is the handler for El1_irq. TEXT ·El1_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_fiq is the handler for El1_fiq. TEXT ·El1_fiq(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El1_error is the handler for El1_error. TEXT ·El1_error(SB),NOSPLIT,$0 B ·Shutdown(SB) +// El0_sync is the handler for El0_sync. TEXT ·El0_sync(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL0 WORD $0xd5385219 // MRS ESR_EL1, R25 @@ -554,7 +604,7 @@ el0_svc: MOVD $Syscall, R3 MOVD R3, CPU_VECTOR_CODE(RSV_REG) - B ·Halt(SB) + B ·HaltAndResume(SB) el0_da: WORD $0xd538d092 //MRS TPIDR_EL1, R18 @@ -568,7 +618,7 @@ el0_da: MOVD $PageFault, R3 MOVD R3, CPU_VECTOR_CODE(RSV_REG) - B ·Halt(SB) + B ·HaltAndResume(SB) el0_ia: B ·Shutdown(SB) @@ -601,7 +651,19 @@ TEXT ·El0_fiq(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_error(SB),NOSPLIT,$0 - B ·Shutdown(SB) + KERNEL_ENTRY_FROM_EL0 + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $1, R3 + MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. + + MOVD $VirtualizationException, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·HaltAndResume(SB) TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) @@ -615,6 +677,7 @@ TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0 TEXT ·El0_error_invalid(SB),NOSPLIT,$0 B ·Shutdown(SB) +// Vectors implements exception vector table. TEXT ·Vectors(SB),NOSPLIT,$0 B ·El1_sync_invalid(SB) nop31Instructions() diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index c3d341998..ccacaea6b 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -16,6 +16,14 @@ package ring0 +// HaltAndResume halts execution and point the pointer to the resume function. +//go:nosplit +func HaltAndResume() + +// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resume. +//go:nosplit +func HaltEl1SvcAndResume() + // init initializes architecture-specific state. func (k *Kernel) init(opts KernelOpts) { // Save the root page tables. diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index 8c960c749..057fb5c69 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -85,6 +85,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) p := &syscall.PtraceRegs{} fmt.Fprintf(w, "\n// Ptrace registers.\n") diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 971eed7fa..581841555 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -7,7 +7,7 @@ go_template( name = "generic_walker", srcs = select_arch( amd64 = ["walker_amd64.go"], - arm64 = ["walker_amd64.go"], + arm64 = ["walker_arm64.go"], ), opt_types = [ "Visitor", @@ -80,7 +80,7 @@ go_library( "pagetables_amd64.go", "pagetables_arm64.go", "pagetables_x86.go", - "pcids_x86.go", + "pcids.go", "walker_amd64.go", "walker_arm64.go", "walker_empty.go", diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids.go index e199bae18..9206030bf 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids.go @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build i386 amd64 - package pagetables import ( diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 79e16d6e8..4d42d29cb 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", "//pkg/syserror", + "//pkg/tcpip", "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 00265f15b..8834a1e1a 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" ) @@ -189,7 +190,7 @@ func putUint32(buf []byte, n uint32) []byte { // putCmsg writes a control message header and as much data as will fit into // the unused capacity of a buffer. func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) { - space := AlignDown(cap(buf)-len(buf), 4) + space := binary.AlignDown(cap(buf)-len(buf), 4) // We can't write to space that doesn't exist, so if we are going to align // the available space, we must align down. @@ -282,19 +283,9 @@ func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c) } -// AlignUp rounds a length up to an alignment. align must be a power of 2. -func AlignUp(length int, align uint) int { - return (length + int(align) - 1) & ^(int(align) - 1) -} - -// AlignDown rounds a down to an alignment. align must be a power of 2. -func AlignDown(length int, align uint) int { - return length & ^(int(align) - 1) -} - // alignSlice extends a slice's length (up to the capacity) to align it. func alignSlice(buf []byte, align uint) []byte { - aligned := AlignUp(len(buf), align) + aligned := binary.AlignUp(len(buf), align) if aligned > cap(buf) { // Linux allows unaligned data if there isn't room for alignment. // Since there isn't room for alignment, there isn't room for any @@ -338,7 +329,7 @@ func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { } // PackTClass packs an IPV6_TCLASS socket control message. -func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte { +func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, @@ -348,6 +339,22 @@ func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte { ) } +// PackIPPacketInfo packs an IP_PKTINFO socket control message. +func PackIPPacketInfo(t *kernel.Task, packetInfo tcpip.IPPacketInfo, buf []byte) []byte { + var p linux.ControlMessageIPPacketInfo + p.NIC = int32(packetInfo.NIC) + copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr)) + copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr)) + + return putCmsgStruct( + buf, + linux.SOL_IP, + linux.IP_PKTINFO, + t.Arch().Width(), + p, + ) +} + // PackControlMessages packs control messages into the given buffer. // // We skip control messages specific to Unix domain sockets. @@ -372,12 +379,16 @@ func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byt buf = PackTClass(t, cmsgs.IP.TClass, buf) } + if cmsgs.IP.HasIPPacketInfo { + buf = PackIPPacketInfo(t, cmsgs.IP.PacketInfo, buf) + } + return buf } // cmsgSpace is equivalent to CMSG_SPACE in Linux. func cmsgSpace(t *kernel.Task, dataLen int) int { - return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width()) + return linux.SizeOfControlMessageHeader + binary.AlignUp(dataLen, t.Arch().Width()) } // CmsgsSpace returns the number of bytes needed to fit the control messages @@ -404,6 +415,16 @@ func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { return space } +// NewIPPacketInfo returns the IPPacketInfo struct. +func NewIPPacketInfo(packetInfo linux.ControlMessageIPPacketInfo) tcpip.IPPacketInfo { + var p tcpip.IPPacketInfo + p.NIC = tcpip.NICID(packetInfo.NIC) + copy([]byte(p.LocalAddr), packetInfo.LocalAddr[:]) + copy([]byte(p.DestinationAddr), packetInfo.DestinationAddr[:]) + + return p +} + // Parse parses a raw socket control message into portable objects. func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) { var ( @@ -437,7 +458,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con case linux.SOL_SOCKET: switch h.Type { case linux.SCM_RIGHTS: - rightsSize := AlignDown(length, linux.SizeOfControlMessageRight) + rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight) numRights := rightsSize / linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { @@ -448,7 +469,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) } - i += AlignUp(length, width) + i += binary.AlignUp(length, width) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { @@ -462,7 +483,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con return socket.ControlMessages{}, err } cmsgs.Unix.Credentials = scmCreds - i += AlignUp(length, width) + i += binary.AlignUp(length, width) default: // Unknown message type. @@ -476,7 +497,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con } cmsgs.IP.HasTOS = true binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS) - i += AlignUp(length, width) + i += binary.AlignUp(length, width) + + case linux.IP_PKTINFO: + if length < linux.SizeOfControlMessageIPPacketInfo { + return socket.ControlMessages{}, syserror.EINVAL + } + + cmsgs.IP.HasIPPacketInfo = true + var packetInfo linux.ControlMessageIPPacketInfo + binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) + + cmsgs.IP.PacketInfo = NewIPPacketInfo(packetInfo) + i += binary.AlignUp(length, width) default: return socket.ControlMessages{}, syserror.EINVAL @@ -489,7 +522,7 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con } cmsgs.IP.HasTClass = true binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass) - i += AlignUp(length, width) + i += binary.AlignUp(length, width) default: return socket.ControlMessages{}, syserror.EINVAL diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 5a07d5d0e..023bad156 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "sockopt_impl.go", "stack.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index bde4c7a1e..22f78d2e2 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } return uint64(n), nil } - return readv(s.fd, iovecsFromBlockSeq(dsts)) + return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) })) return int64(n), err } @@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } return uint64(n), nil } - return writev(s.fd, iovecsFromBlockSeq(srcs)) + return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) })) return int64(n), err } @@ -285,11 +285,11 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // Whitelist options and constrain option length. - var optlen int + optlen := getSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { - case linux.IP_TOS, linux.IP_RECVTOS: + case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO: optlen = sizeofInt32 } case linux.SOL_IPV6: @@ -330,12 +330,14 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. - var optlen int + optlen := setSockOptLen(t, level, name) switch level { case linux.SOL_IP: switch name { case linux.IP_TOS, linux.IP_RECVTOS: optlen = sizeofInt32 + case linux.IP_PKTINFO: + optlen = linux.SizeOfControlMessageIPPacketInfo } case linux.SOL_IPV6: switch name { @@ -353,6 +355,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ optlen = sizeofInt32 } } + if optlen == 0 { // Pretend to accept socket options we don't understand. This seems // dangerous, but it's what netstack does... @@ -402,7 +405,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // We always do a non-blocking recv*(). sysflags := flags | syscall.MSG_DONTWAIT - iovs := iovecsFromBlockSeq(dsts) + iovs := safemem.IovecsFromBlockSeq(dsts) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -472,7 +475,14 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags case syscall.IP_TOS: controlMessages.IP.HasTOS = true binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS) + + case syscall.IP_PKTINFO: + controlMessages.IP.HasIPPacketInfo = true + var packetInfo linux.ControlMessageIPPacketInfo + binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageIPPacketInfo], usermem.ByteOrder, &packetInfo) + controlMessages.IP.PacketInfo = control.NewIPPacketInfo(packetInfo) } + case syscall.SOL_IPV6: switch unixCmsg.Header.Type { case syscall.IPV6_TCLASS: @@ -522,7 +532,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return uint64(n), nil } - iovs := iovecsFromBlockSeq(srcs) + iovs := safemem.IovecsFromBlockSeq(srcs) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -567,21 +577,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(n), syserr.FromError(err) } -func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec { - iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) - for ; !bs.IsEmpty(); bs = bs.Tail() { - b := bs.Head() - iovs = append(iovs, syscall.Iovec{ - Base: &b.ToSlice()[0], - Len: uint64(b.Len()), - }) - // We don't need to care about b.NeedSafecopy(), because the host - // kernel will handle such address ranges just fine (by returning - // EFAULT). - } - return iovs -} - func translateIOSyscallError(err error) error { if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { return syserror.ErrWouldBlock diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/socket/hostinet/sockopt_impl.go index 4d54b8b8f..8a783712e 100644 --- a/pkg/sentry/kernel/pipe/buffer_test.go +++ b/pkg/sentry/socket/hostinet/sockopt_impl.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,21 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -package pipe +package hostinet import ( - "testing" - "unsafe" - - "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/sentry/kernel" ) -func TestBufferSize(t *testing.T) { - bufferSize := unsafe.Sizeof(buffer{}) - if bufferSize < usermem.PageSize { - t.Errorf("buffer is less than a page") - } - if bufferSize > (2 * usermem.PageSize) { - t.Errorf("buffer is greater than two pages") - } +func getSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. +} + +func setSockOptLen(t *kernel.Task, level, name int) int { + return 0 // No custom options. } diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 034eca676..a48082631 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -310,6 +310,11 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { return addrs } +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + return syserror.EACCES +} + // SupportsIPv6 implements inet.Stack.SupportsIPv6. func (s *Stack) SupportsIPv6() bool { return s.supportsIPv6 diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index fa2a2cb66..7cd2ce55b 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -5,7 +5,11 @@ package(licenses = ["notice"]) go_library( name = "netfilter", srcs = [ + "extensions.go", "netfilter.go", + "targets.go", + "tcp_matcher.go", + "udp_matcher.go", ], # This target depends on netstack and should only be used by epsocket, # which is allowed to depend on netstack. @@ -17,6 +21,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/syserr", "//pkg/tcpip", + "//pkg/tcpip/header", "//pkg/tcpip/iptables", "//pkg/tcpip/stack", "//pkg/usermem", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go new file mode 100644 index 000000000..b4b244abf --- /dev/null +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -0,0 +1,95 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +// TODO(gvisor.dev/issue/170): The following per-matcher params should be +// supported: +// - Table name +// - Match size +// - User size +// - Hooks +// - Proto +// - Family + +// matchMaker knows how to (un)marshal the matcher named name(). +type matchMaker interface { + // name is the matcher name as stored in the xt_entry_match struct. + name() string + + // marshal converts from an iptables.Matcher to an ABI struct. + marshal(matcher iptables.Matcher) []byte + + // unmarshal converts from the ABI matcher struct to an + // iptables.Matcher. + unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) +} + +// matchMakers maps the name of supported matchers to the matchMaker that +// marshals and unmarshals it. It is immutable after package initialization. +var matchMakers = map[string]matchMaker{} + +// registermatchMaker should be called by match extensions to register them +// with the netfilter package. +func registerMatchMaker(mm matchMaker) { + if _, ok := matchMakers[mm.name()]; ok { + panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name())) + } + matchMakers[mm.name()] = mm +} + +func marshalMatcher(matcher iptables.Matcher) []byte { + matchMaker, ok := matchMakers[matcher.Name()] + if !ok { + panic(fmt.Sprintf("Unknown matcher of type %T.", matcher)) + } + return matchMaker.marshal(matcher) +} + +// marshalEntryMatch creates a marshalled XTEntryMatch with the given name and +// data appended at the end. +func marshalEntryMatch(name string, data []byte) []byte { + nflog("marshaling matcher %q", name) + + // We have to pad this struct size to a multiple of 8 bytes. + size := binary.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8) + matcher := linux.KernelXTEntryMatch{ + XTEntryMatch: linux.XTEntryMatch{ + MatchSize: uint16(size), + }, + Data: data, + } + copy(matcher.Name[:], name) + + buf := make([]byte, 0, size) + buf = binary.Marshal(buf, usermem.ByteOrder, matcher) + return append(buf, make([]byte, size-len(buf))...) +} + +func unmarshalMatcher(match linux.XTEntryMatch, filter iptables.IPHeaderFilter, buf []byte) (iptables.Matcher, error) { + matchMaker, ok := matchMakers[match.Name.String()] + if !ok { + return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) + } + return matchMaker.unmarshal(buf, filter) +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 6ef740463..2ec11f6ac 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,6 +17,7 @@ package netfilter import ( + "errors" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -34,9 +35,12 @@ import ( // shouldn't be reached - an error has occurred if we fall through to one. const errorTargetName = "ERROR" -// metadata is opaque to netstack. It holds data that we need to translate -// between Linux's and netstack's iptables representations. -// TODO(gvisor.dev/issue/170): This might be removable. +// Metadata is used to verify that we are correctly serializing and +// deserializing iptables into structs consumable by the iptables tool. We save +// a metadata struct when the tables are written, and when they are read out we +// verify that certain fields are the same. +// +// metadata is used by this serialization/deserializing code, not netstack. type metadata struct { HookEntry [linux.NF_INET_NUMHOOKS]uint32 Underflow [linux.NF_INET_NUMHOOKS]uint32 @@ -44,6 +48,13 @@ type metadata struct { Size uint32 } +// nflog logs messages related to the writing and reading of iptables. +func nflog(format string, args ...interface{}) { + if log.IsLogging(log.Debug) { + log.Debugf("netfilter: "+format, args...) + } +} + // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. @@ -55,7 +66,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT // Find the appropriate table. table, err := findTable(stack, info.Name) if err != nil { - return linux.IPTGetinfo{}, err + nflog("%v", err) + return linux.IPTGetinfo{}, syserr.ErrInvalidArgument } // Get the hooks that apply to this table. @@ -72,6 +84,8 @@ func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPT info.NumEntries = metadata.NumEntries info.Size = metadata.Size + nflog("returning info: %+v", info) + return info, nil } @@ -80,34 +94,40 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } // Find the appropriate table. table, err := findTable(stack, userEntries.Name) if err != nil { - return linux.KernelIPTGetEntries{}, err + nflog("%v", err) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } // Convert netstack's iptables rules to something that the iptables // tool can understand. - entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table) + entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table) if err != nil { - return linux.KernelIPTGetEntries{}, err + nflog("couldn't read entries: %v", err) + return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument + } + if meta != table.Metadata().(metadata) { + panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta)) } if binary.Size(entries) > uintptr(outLen) { - log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen)) + nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } -func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, *syserr.Error) { +func findTable(stack *stack.Stack, tablename linux.TableName) (iptables.Table, error) { ipt := stack.IPTables() table, ok := ipt.Tables[tablename.String()] if !ok { - return iptables.Table{}, syserr.ErrInvalidArgument + return iptables.Table{}, fmt.Errorf("couldn't find table %q", tablename) } return table, nil } @@ -135,28 +155,31 @@ func FillDefaultIPTables(stack *stack.Stack) { // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a bit, reading some offsets, // jumping to those offsets, parsing again, etc. -func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) { +func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, error) { // Return values. var entries linux.KernelIPTGetEntries var meta metadata // The table name has to fit in the struct. if linux.XT_TABLE_MAXNAMELEN < len(tablename) { - log.Warningf("Table name %q too long.", tablename) - return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument + return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("table name %q too long.", tablename) } copy(entries.Name[:], tablename) for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + // Is this a chain entry point? for hook, hookRuleIdx := range table.BuiltinChains { if hookRuleIdx == ruleIdx { + nflog("convert to binary: found hook %d at offset %d", hook, entries.Size) meta.HookEntry[hook] = entries.Size } } // Is this a chain underflow point? for underflow, underflowRuleIdx := range table.Underflows { if underflowRuleIdx == ruleIdx { + nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size) meta.Underflow[underflow] = entries.Size } } @@ -176,6 +199,10 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) + nflog("convert to binary: matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } entry.Elems = append(entry.Elems, serialized...) entry.NextOffset += uint16(len(serialized)) entry.TargetOffset += uint16(len(serialized)) @@ -183,41 +210,46 @@ func convertNetstackToBinary(tablename string, table iptables.Table) (linux.Kern // Serialize and append the target. serialized := marshalTarget(rule.Target) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } entry.Elems = append(entry.Elems, serialized...) entry.NextOffset += uint16(len(serialized)) + nflog("convert to binary: adding entry: %+v", entry) + entries.Size += uint32(entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) meta.NumEntries++ } + nflog("convert to binary: finished with an marshalled size of %d", meta.Size) meta.Size = entries.Size return entries, meta, nil } -func marshalMatcher(matcher iptables.Matcher) []byte { - switch matcher.(type) { - default: - // TODO(gvisor.dev/issue/170): We don't support any matchers - // yet, so any call to marshalMatcher will panic. - panic(fmt.Errorf("unknown matcher of type %T", matcher)) - } -} - func marshalTarget(target iptables.Target) []byte { - switch target.(type) { - case iptables.UnconditionalAcceptTarget: - return marshalStandardTarget(iptables.Accept) - case iptables.UnconditionalDropTarget: - return marshalStandardTarget(iptables.Drop) + switch tg := target.(type) { + case iptables.AcceptTarget: + return marshalStandardTarget(iptables.RuleAccept) + case iptables.DropTarget: + return marshalStandardTarget(iptables.RuleDrop) case iptables.ErrorTarget: - return marshalErrorTarget() + return marshalErrorTarget(errorTargetName) + case iptables.UserChainTarget: + return marshalErrorTarget(tg.Name) + case iptables.ReturnTarget: + return marshalStandardTarget(iptables.RuleReturn) + case JumpTarget: + return marshalJumpTarget(tg) default: panic(fmt.Errorf("unknown target of type %T", target)) } } -func marshalStandardTarget(verdict iptables.Verdict) []byte { +func marshalStandardTarget(verdict iptables.RuleVerdict) []byte { + nflog("convert to binary: marshalling standard target") + // The target's name will be the empty string. target := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ @@ -230,66 +262,77 @@ func marshalStandardTarget(verdict iptables.Verdict) []byte { return binary.Marshal(ret, usermem.ByteOrder, target) } -func marshalErrorTarget() []byte { +func marshalErrorTarget(errorName string) []byte { // This is an error target named error target := linux.XTErrorTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTErrorTarget, }, } - copy(target.Name[:], errorTargetName) + copy(target.Name[:], errorName) copy(target.Target.Name[:], errorTargetName) ret := make([]byte, 0, linux.SizeOfXTErrorTarget) return binary.Marshal(ret, usermem.ByteOrder, target) } +func marshalJumpTarget(jt JumpTarget) []byte { + nflog("convert to binary: marshalling jump target") + + // The target's name will be the empty string. + target := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + // Verdict is overloaded by the ABI. When positive, it holds + // the jump offset from the start of the table. + Verdict: int32(jt.Offset), + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, target) +} + // translateFromStandardVerdict translates verdicts the same way as the iptables // tool. -func translateFromStandardVerdict(verdict iptables.Verdict) int32 { +func translateFromStandardVerdict(verdict iptables.RuleVerdict) int32 { switch verdict { - case iptables.Accept: + case iptables.RuleAccept: return -linux.NF_ACCEPT - 1 - case iptables.Drop: + case iptables.RuleDrop: return -linux.NF_DROP - 1 - case iptables.Queue: - return -linux.NF_QUEUE - 1 - case iptables.Return: + case iptables.RuleReturn: return linux.NF_RETURN - case iptables.Jump: + default: // TODO(gvisor.dev/issue/170): Support Jump. - panic("Jump isn't supported yet") + panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) } - panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) } -// translateToStandardVerdict translates from the value in a +// translateToStandardTarget translates from the value in a // linux.XTStandardTarget to an iptables.Verdict. -func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) { +func translateToStandardTarget(val int32) (iptables.Target, error) { // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: - return iptables.Accept, nil + return iptables.AcceptTarget{}, nil case -linux.NF_DROP - 1: - return iptables.Drop, nil + return iptables.DropTarget{}, nil case -linux.NF_QUEUE - 1: - log.Warningf("Unsupported iptables verdict QUEUE.") + return nil, errors.New("unsupported iptables verdict QUEUE") case linux.NF_RETURN: - log.Warningf("Unsupported iptables verdict RETURN.") + return iptables.ReturnTarget{}, nil default: - log.Warningf("Unknown iptables verdict %d.", val) + return nil, fmt.Errorf("unknown iptables verdict %d", val) } - return iptables.Invalid, syserr.ErrInvalidArgument } // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { - printReplace(optVal) - // Get the basic rules data (struct ipt_replace). if len(optVal) < linux.SizeOfIPTReplace { - log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal)) + nflog("optVal has insufficient size for replace %d", len(optVal)) return syserr.ErrInvalidArgument } var replace linux.IPTReplace @@ -303,25 +346,32 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { case iptables.TablenameFilter: table = iptables.EmptyFilterTable() default: - log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) + nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) return syserr.ErrInvalidArgument } + nflog("set entries: setting entries in table %q", replace.Name.String()) + // Convert input into a list of rules and their offsets. var offset uint32 - var offsets []uint32 + // offsets maps rule byte offsets to their position in table.Rules. + offsets := map[uint32]int{} for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { + nflog("set entries: processing entry at offset %d", offset) + // Get the struct ipt_entry. if len(optVal) < linux.SizeOfIPTEntry { - log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal)) + nflog("optVal has insufficient size for entry %d", len(optVal)) return syserr.ErrInvalidArgument } var entry linux.IPTEntry buf := optVal[:linux.SizeOfIPTEntry] - optVal = optVal[linux.SizeOfIPTEntry:] binary.Unmarshal(buf, usermem.ByteOrder, &entry) - if entry.TargetOffset != linux.SizeOfIPTEntry { - // TODO(gvisor.dev/issue/170): Support matchers. + initialOptValLen := len(optVal) + optVal = optVal[linux.SizeOfIPTEntry:] + + if entry.TargetOffset < linux.SizeOfIPTEntry { + nflog("entry has too-small target offset %d", entry.TargetOffset) return syserr.ErrInvalidArgument } @@ -329,22 +379,50 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { // filtering fields. filter, err := filterFromIPTIP(entry.IP) if err != nil { - return err + nflog("bad iptip: %v", err) + return syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Matchers and targets can specify + // that they only work for certain protocols, hooks, tables. + // Get matchers. + matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry + if len(optVal) < int(matchersSize) { + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + nflog("failed to parse matchers: %v", err) + return syserr.ErrInvalidArgument } + optVal = optVal[matchersSize:] // Get the target of the rule. - target, consumed, err := parseTarget(optVal) + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return syserr.ErrInvalidArgument + } + target, err := parseTarget(optVal[:targetSize]) if err != nil { - return err + nflog("failed to parse target: %v", err) + return syserr.ErrInvalidArgument } - optVal = optVal[consumed:] + optVal = optVal[targetSize:] table.Rules = append(table.Rules, iptables.Rule{ - Filter: filter, - Target: target, + Filter: filter, + Target: target, + Matchers: matchers, }) - offsets = append(offsets, offset) - offset += linux.SizeOfIPTEntry + consumed + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return syserr.ErrInvalidArgument + } } // Go through the list of supported hooks for this table and, for each @@ -352,32 +430,77 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { for hook, _ := range replace.HookEntry { if table.ValidHooks()&(1<<hook) != 0 { hk := hookFromLinux(hook) - for ruleIdx, offset := range offsets { + for offset, ruleIdx := range offsets { if offset == replace.HookEntry[hook] { table.BuiltinChains[hk] = ruleIdx } if offset == replace.Underflow[hook] { + if !validUnderflow(table.Rules[ruleIdx]) { + nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP") + return syserr.ErrInvalidArgument + } table.Underflows[hk] = ruleIdx } } if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset { - log.Warningf("Hook %v is unset.", hk) + nflog("hook %v is unset.", hk) return syserr.ErrInvalidArgument } if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset { - log.Warningf("Underflow %v is unset.", hk) + nflog("underflow %v is unset.", hk) return syserr.ErrInvalidArgument } } } + // Add the user chains. + for ruleIdx, rule := range table.Rules { + target, ok := rule.Target.(iptables.UserChainTarget) + if !ok { + continue + } + + // We found a user chain. Before inserting it into the table, + // check that: + // - There's some other rule after it. + // - There are no matchers. + if ruleIdx == len(table.Rules)-1 { + nflog("user chain must have a rule or default policy") + return syserr.ErrInvalidArgument + } + if len(table.Rules[ruleIdx].Matchers) != 0 { + nflog("user chain's first node must have no matchers") + return syserr.ErrInvalidArgument + } + table.UserChains[target.Name] = ruleIdx + 1 + } + + // Set each jump to point to the appropriate rule. Right now they hold byte + // offsets. + for ruleIdx, rule := range table.Rules { + jump, ok := rule.Target.(JumpTarget) + if !ok { + continue + } + + // Find the rule corresponding to the jump rule offset. + jumpTo, ok := offsets[jump.Offset] + if !ok { + nflog("failed to find a rule to jump to") + return syserr.ErrInvalidArgument + } + jump.RuleNum = jumpTo + rule.Target = jump + table.Rules[ruleIdx] = rule + } + // TODO(gvisor.dev/issue/170): Support other chains. // Since we only support modifying the INPUT chain right now, make sure // all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { if hook != iptables.Input { - if _, ok := table.Rules[ruleIdx].Target.(iptables.UnconditionalAcceptTarget); !ok { - log.Warningf("Hook %d is unsupported.", hook) + if _, ok := table.Rules[ruleIdx].Target.(iptables.AcceptTarget); !ok { + nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } } @@ -401,12 +524,56 @@ func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error { return nil } -// parseTarget parses a target from the start of optVal and returns the target -// along with the number of bytes it occupies in optVal. -func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { +// parseMatchers parses 0 or more matchers from optVal. optVal should contain +// only the matchers. +func parseMatchers(filter iptables.IPHeaderFilter, optVal []byte) ([]iptables.Matcher, error) { + nflog("set entries: parsing matchers of size %d", len(optVal)) + var matchers []iptables.Matcher + for len(optVal) > 0 { + nflog("set entries: optVal has len %d", len(optVal)) + + // Get the XTEntryMatch. + if len(optVal) < linux.SizeOfXTEntryMatch { + return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal)) + } + var match linux.XTEntryMatch + buf := optVal[:linux.SizeOfXTEntryMatch] + binary.Unmarshal(buf, usermem.ByteOrder, &match) + nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match) + + // Check some invariants. + if match.MatchSize < linux.SizeOfXTEntryMatch { + + return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) + } + if len(optVal) < int(match.MatchSize) { + return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal)) + } + + // Parse the specific matcher. + matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) + if err != nil { + return nil, fmt.Errorf("failed to create matcher: %v", err) + } + matchers = append(matchers, matcher) + + // TODO(gvisor.dev/issue/170): Check the revision field. + optVal = optVal[match.MatchSize:] + } + + if len(optVal) != 0 { + return nil, errors.New("optVal should be exhausted after parsing matchers") + } + + return matchers, nil +} + +// parseTarget parses a target from optVal. optVal should contain only the +// target. +func parseTarget(optVal []byte) (iptables.Target, error) { + nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { - log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) } var target linux.XTEntryTarget buf := optVal[:linux.SizeOfXTEntryTarget] @@ -414,32 +581,24 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { switch target.Name.String() { case "": // Standard target. - if len(optVal) < linux.SizeOfXTStandardTarget { - log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + if len(optVal) != linux.SizeOfXTStandardTarget { + return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal)) } var standardTarget linux.XTStandardTarget buf = optVal[:linux.SizeOfXTStandardTarget] binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) - verdict, err := translateToStandardVerdict(standardTarget.Verdict) - if err != nil { - return nil, 0, err - } - switch verdict { - case iptables.Accept: - return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil - case iptables.Drop: - return iptables.UnconditionalDropTarget{}, linux.SizeOfXTStandardTarget, nil - default: - panic(fmt.Sprintf("Unknown verdict: %v", verdict)) + if standardTarget.Verdict < 0 { + // A Verdict < 0 indicates a non-jump verdict. + return translateToStandardTarget(standardTarget.Verdict) } + // A verdict >= 0 indicates a jump. + return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil case errorTargetName: // Error target. - if len(optVal) < linux.SizeOfXTErrorTarget { - log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal)) - return nil, 0, syserr.ErrInvalidArgument + if len(optVal) != linux.SizeOfXTErrorTarget { + return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal)) } var errorTarget linux.XTErrorTarget buf = optVal[:linux.SizeOfXTErrorTarget] @@ -452,24 +611,24 @@ func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) { // somehow fall through every rule. // * To mark the start of a user defined chain. These // rules have an error with the name of the chain. - switch errorTarget.Name.String() { + switch name := errorTarget.Name.String(); name { case errorTargetName: - return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil + nflog("set entries: error target") + return iptables.ErrorTarget{}, nil default: - log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String()) - return nil, 0, syserr.ErrInvalidArgument + // User defined chain. + nflog("set entries: user-defined target %q", name) + return iptables.UserChainTarget{Name: name}, nil } } // Unknown target. - log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) - return nil, 0, syserr.ErrInvalidArgument + return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) } -func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, *syserr.Error) { +func filterFromIPTIP(iptip linux.IPTIP) (iptables.IPHeaderFilter, error) { if containsUnsupportedFields(iptip) { - log.Warningf("netfilter: unsupported fields in struct iptip: %+v", iptip) - return iptables.IPHeaderFilter{}, syserr.ErrInvalidArgument + return iptables.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) } return iptables.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), @@ -492,6 +651,18 @@ func containsUnsupportedFields(iptip linux.IPTIP) bool { iptip.InverseFlags != 0 } +func validUnderflow(rule iptables.Rule) bool { + if len(rule.Matchers) != 0 { + return false + } + switch rule.Target.(type) { + case iptables.AcceptTarget, iptables.DropTarget: + return true + default: + return false + } +} + func hookFromLinux(hook int) iptables.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: @@ -507,52 +678,3 @@ func hookFromLinux(hook int) iptables.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } - -// printReplace prints information about the struct ipt_replace in optVal. It -// is only for debugging. -func printReplace(optVal []byte) { - // Basic replace info. - var replace linux.IPTReplace - replaceBuf := optVal[:linux.SizeOfIPTReplace] - optVal = optVal[linux.SizeOfIPTReplace:] - binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace) - log.Infof("Replacing table %q: %+v", replace.Name.String(), replace) - - // Read in the list of entries at the end of replace. - var totalOffset uint16 - for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { - var entry linux.IPTEntry - entryBuf := optVal[:linux.SizeOfIPTEntry] - binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry) - log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry) - - totalOffset += entry.NextOffset - if entry.TargetOffset == linux.SizeOfIPTEntry { - log.Infof("Entry has no matches.") - } else { - log.Infof("Entry has matches.") - } - - var target linux.XTEntryTarget - targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget] - binary.Unmarshal(targetBuf, usermem.ByteOrder, &target) - log.Infof("Target named %q: %+v", target.Name.String(), target) - - switch target.Name.String() { - case "": - var standardTarget linux.XTStandardTarget - stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget] - binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget) - log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict) - case errorTargetName: - var errorTarget linux.XTErrorTarget - etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget] - binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget) - log.Infof("Error target with name %q.", errorTarget.Name.String()) - default: - log.Infof("Unknown target type.") - } - - optVal = optVal[entry.NextOffset:] - } -} diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go new file mode 100644 index 000000000..c421b87cf --- /dev/null +++ b/pkg/sentry/socket/netfilter/targets.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/iptables" +) + +// JumpTarget implements iptables.Target. +type JumpTarget struct { + // Offset is the byte offset of the rule to jump to. It is used for + // marshaling and unmarshaling. + Offset uint32 + + // RuleNum is the rule to jump to. + RuleNum int +} + +// Action implements iptables.Target.Action. +func (jt JumpTarget) Action(tcpip.PacketBuffer) (iptables.RuleVerdict, int) { + return iptables.RuleJump, jt.RuleNum +} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go new file mode 100644 index 000000000..f9945e214 --- /dev/null +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameTCP = "tcp" + +func init() { + registerMatchMaker(tcpMarshaler{}) +} + +// tcpMarshaler implements matchMaker for TCP matching. +type tcpMarshaler struct{} + +// name implements matchMaker.name. +func (tcpMarshaler) name() string { + return matcherNameTCP +} + +// marshal implements matchMaker.marshal. +func (tcpMarshaler) marshal(mr iptables.Matcher) []byte { + matcher := mr.(*TCPMatcher) + xttcp := linux.XTTCP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTTCP) + return marshalEntryMatch(matcherNameTCP, binary.Marshal(buf, usermem.ByteOrder, xttcp)) +} + +// unmarshal implements matchMaker.unmarshal. +func (tcpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { + if len(buf) < linux.SizeOfXTTCP { + return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may + // exceed what's strictly necessary to hold matchData. + var matchData linux.XTTCP + binary.Unmarshal(buf[:linux.SizeOfXTTCP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTTCP: %+v", matchData) + + if matchData.Option != 0 || + matchData.FlagMask != 0 || + matchData.FlagCompare != 0 || + matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported TCP matcher flags set") + } + + if filter.Protocol != header.TCPProtocolNumber { + return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) + } + + return &TCPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// TCPMatcher matches TCP packets and their headers. It implements Matcher. +type TCPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. +func (*TCPMatcher) Name() string { + return matcherNameTCP +} + +// Match implements Matcher.Match. +func (tm *TCPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO(gvisor.dev/issue/170): Parsing the transport header should + // ultimately be moved into the iptables.Check codepath as matchers are + // added. + var tcpHeader header.TCP + if pkt.TransportHeader != nil { + tcpHeader = header.TCP(pkt.TransportHeader) + } else { + // The TCP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.TCPMinimumSize { + // There's no valid TCP header here, so we hotdrop the + // packet. + return false, true + } + tcpHeader = header.TCP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := tcpHeader.SourcePort(); sourcePort < tm.sourcePortStart || tm.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := tcpHeader.DestinationPort(); destinationPort < tm.destinationPortStart || tm.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go new file mode 100644 index 000000000..86aa11696 --- /dev/null +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -0,0 +1,142 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/iptables" + "gvisor.dev/gvisor/pkg/usermem" +) + +const matcherNameUDP = "udp" + +func init() { + registerMatchMaker(udpMarshaler{}) +} + +// udpMarshaler implements matchMaker for UDP matching. +type udpMarshaler struct{} + +// name implements matchMaker.name. +func (udpMarshaler) name() string { + return matcherNameUDP +} + +// marshal implements matchMaker.marshal. +func (udpMarshaler) marshal(mr iptables.Matcher) []byte { + matcher := mr.(*UDPMatcher) + xtudp := linux.XTUDP{ + SourcePortStart: matcher.sourcePortStart, + SourcePortEnd: matcher.sourcePortEnd, + DestinationPortStart: matcher.destinationPortStart, + DestinationPortEnd: matcher.destinationPortEnd, + } + buf := make([]byte, 0, linux.SizeOfXTUDP) + return marshalEntryMatch(matcherNameUDP, binary.Marshal(buf, usermem.ByteOrder, xtudp)) +} + +// unmarshal implements matchMaker.unmarshal. +func (udpMarshaler) unmarshal(buf []byte, filter iptables.IPHeaderFilter) (iptables.Matcher, error) { + if len(buf) < linux.SizeOfXTUDP { + return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) + } + + // For alignment reasons, the match's total size may exceed what's + // strictly necessary to hold matchData. + var matchData linux.XTUDP + binary.Unmarshal(buf[:linux.SizeOfXTUDP], usermem.ByteOrder, &matchData) + nflog("parseMatchers: parsed XTUDP: %+v", matchData) + + if matchData.InverseFlags != 0 { + return nil, fmt.Errorf("unsupported UDP matcher inverse flags set") + } + + if filter.Protocol != header.UDPProtocolNumber { + return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + } + + return &UDPMatcher{ + sourcePortStart: matchData.SourcePortStart, + sourcePortEnd: matchData.SourcePortEnd, + destinationPortStart: matchData.DestinationPortStart, + destinationPortEnd: matchData.DestinationPortEnd, + }, nil +} + +// UDPMatcher matches UDP packets and their headers. It implements Matcher. +type UDPMatcher struct { + sourcePortStart uint16 + sourcePortEnd uint16 + destinationPortStart uint16 + destinationPortEnd uint16 +} + +// Name implements Matcher.Name. +func (*UDPMatcher) Name() string { + return matcherNameUDP +} + +// Match implements Matcher.Match. +func (um *UDPMatcher) Match(hook iptables.Hook, pkt tcpip.PacketBuffer, interfaceName string) (bool, bool) { + netHeader := header.IPv4(pkt.NetworkHeader) + + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the iptables.Check codepath as matchers are added. + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false + } + + // We dont't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { + return false, true + } + return false, false + } + + // Now we need the transport header. However, this may not have been set + // yet. + // TODO(gvisor.dev/issue/170): Parsing the transport header should + // ultimately be moved into the iptables.Check codepath as matchers are + // added. + var udpHeader header.UDP + if pkt.TransportHeader != nil { + udpHeader = header.UDP(pkt.TransportHeader) + } else { + // The UDP header hasn't been parsed yet. We have to do it here. + if len(pkt.Data.First()) < header.UDPMinimumSize { + // There's no valid UDP header here, so we hotdrop the + // packet. + return false, true + } + udpHeader = header.UDP(pkt.Data.First()) + } + + // Check whether the source and destination ports are within the + // matching range. + if sourcePort := udpHeader.SourcePort(); sourcePort < um.sourcePortStart || um.sourcePortEnd < sourcePort { + return false, false + } + if destinationPort := udpHeader.DestinationPort(); destinationPort < um.destinationPortStart || um.destinationPortEnd < destinationPort { + return false, false + } + + return true, false +} diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index f8b8e467d..1911cd9b8 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -33,3 +33,15 @@ go_library( "//pkg/waiter", ], ) + +go_test( + name = "netlink_test", + size = "small", + srcs = [ + "message_test.go", + ], + deps = [ + ":netlink", + "//pkg/abi/linux", + ], +) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index b21e0ca4b..0899c61d1 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -23,15 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// alignUp rounds a length up to an alignment. +// alignPad returns the length of padding required for alignment. // // Preconditions: align is a power of two. -func alignUp(length int, align uint) int { - return (length + int(align) - 1) &^ (int(align) - 1) +func alignPad(length int, align uint) int { + return binary.AlignUp(length, align) - length } // Message contains a complete serialized netlink message. type Message struct { + hdr linux.NetlinkMessageHeader buf []byte } @@ -40,10 +41,86 @@ type Message struct { // The header length will be updated by Finalize. func NewMessage(hdr linux.NetlinkMessageHeader) *Message { return &Message{ + hdr: hdr, buf: binary.Marshal(nil, usermem.ByteOrder, hdr), } } +// ParseMessage parses the first message seen at buf, returning the rest of the +// buffer. If message is malformed, ok of false is returned. For last message, +// padding check is loose, if there isn't enought padding, whole buf is consumed +// and ok is set to true. +func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) { + b := BytesView(buf) + + hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return + } + var hdr linux.NetlinkMessageHeader + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + // Msg portion. + totalMsgLen := int(hdr.Length) + _, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize) + if !ok { + return + } + + // Padding. + numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO) + // Linux permits the last message not being aligned, just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return + } + + return &Message{ + hdr: hdr, + buf: buf[:totalMsgLen], + }, []byte(b), true +} + +// Header returns the header of this message. +func (m *Message) Header() linux.NetlinkMessageHeader { + return m.hdr +} + +// GetData unmarshals the payload message header from this netlink message, and +// returns the attributes portion. +func (m *Message) GetData(msg interface{}) (AttrsView, bool) { + b := BytesView(m.buf) + + _, ok := b.Extract(linux.NetlinkMessageHeaderSize) + if !ok { + return nil, false + } + + size := int(binary.Size(msg)) + msgBytes, ok := b.Extract(size) + if !ok { + return nil, false + } + binary.Unmarshal(msgBytes, usermem.ByteOrder, msg) + + numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO) + // Linux permits the last message not being aligned, just consume all of it. + // Ref: net/netlink/af_netlink.c:netlink_rcv_skb + if numPad > len(b) { + numPad = len(b) + } + _, ok = b.Extract(numPad) + if !ok { + return nil, false + } + + return AttrsView(b), true +} + // Finalize returns the []byte containing the entire message, with the total // length set in the message header. The Message must not be modified after // calling Finalize. @@ -54,7 +131,7 @@ func (m *Message) Finalize() []byte { // Align the message. Note that the message length in the header (set // above) is the useful length of the message, not the total aligned // length. See net/netlink/af_netlink.c:__nlmsg_put. - aligned := alignUp(len(m.buf), linux.NLMSG_ALIGNTO) + aligned := binary.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO) m.putZeros(aligned - len(m.buf)) return m.buf } @@ -89,7 +166,7 @@ func (m *Message) PutAttr(atype uint16, v interface{}) { m.Put(v) // Align the attribute. - aligned := alignUp(l, linux.NLA_ALIGNTO) + aligned := binary.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } @@ -106,7 +183,7 @@ func (m *Message) PutAttrString(atype uint16, s string) { m.putZeros(1) // Align the attribute. - aligned := alignUp(l, linux.NLA_ALIGNTO) + aligned := binary.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } @@ -157,3 +234,48 @@ func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { ms.Messages = append(ms.Messages, m) return m } + +// AttrsView is a view into the attributes portion of a netlink message. +type AttrsView []byte + +// Empty returns whether there is no attribute left in v. +func (v AttrsView) Empty() bool { + return len(v) == 0 +} + +// ParseFirst parses first netlink attribute at the beginning of v. +func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) { + b := BytesView(v) + + hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize) + if !ok { + return + } + binary.Unmarshal(hdrBytes, usermem.ByteOrder, &hdr) + + value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize) + if !ok { + return + } + + _, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO)) + if !ok { + return + } + + return hdr, value, AttrsView(b), ok +} + +// BytesView supports extracting data from a byte slice with bounds checking. +type BytesView []byte + +// Extract removes the first n bytes from v and returns it. If n is out of +// bounds, it returns false. +func (v *BytesView) Extract(n int) ([]byte, bool) { + if n < 0 || n > len(*v) { + return nil, false + } + extracted := (*v)[:n] + *v = (*v)[n:] + return extracted, true +} diff --git a/pkg/sentry/socket/netlink/message_test.go b/pkg/sentry/socket/netlink/message_test.go new file mode 100644 index 000000000..ef13d9386 --- /dev/null +++ b/pkg/sentry/socket/netlink/message_test.go @@ -0,0 +1,312 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package message_test + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/socket/netlink" +) + +type dummyNetlinkMsg struct { + Foo uint16 +} + +func TestParseMessage(t *testing.T) { + tests := []struct { + desc string + input []byte + + header linux.NetlinkMessageHeader + dataMsg *dummyNetlinkMsg + restLen int + ok bool + }{ + { + desc: "valid", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid with next message", + input: []byte{ + 0x14, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + 0xFF, // Next message (rest) + }, + header: linux.NetlinkMessageHeader{ + Length: 20, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 1, + ok: true, + }, + { + desc: "valid for last message without padding", + input: []byte{ + 0x12, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + }, + header: linux.NetlinkMessageHeader{ + Length: 18, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "valid for last message not to be aligned", + input: []byte{ + 0x13, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, // Data message + 0x00, // Excessive 1 byte permitted at end + }, + header: linux.NetlinkMessageHeader{ + Length: 19, + Type: 1, + Flags: 2, + Seq: 3, + PortID: 4, + }, + dataMsg: &dummyNetlinkMsg{ + Foo: 0x3130, + }, + restLen: 0, + ok: true, + }, + { + desc: "header.Length too short", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: "header.Length too long", + input: []byte{ + 0xFF, 0xFF, 0x00, 0x00, // Length + 0x01, 0x00, // Type + 0x02, 0x00, // Flags + 0x03, 0x00, 0x00, 0x00, // Seq + 0x04, 0x00, 0x00, 0x00, // PortID + 0x30, 0x31, 0x00, 0x00, // Data message with 2 bytes padding + }, + ok: false, + }, + { + desc: "header incomplete", + input: []byte{ + 0x04, 0x00, 0x00, 0x00, // Length + }, + ok: false, + }, + { + desc: "empty message", + input: []byte{}, + ok: false, + }, + } + for _, test := range tests { + msg, rest, ok := netlink.ParseMessage(test.input) + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + continue + } + if !test.ok { + continue + } + if !reflect.DeepEqual(msg.Header(), test.header) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, msg.Header(), test.header) + } + + dataMsg := &dummyNetlinkMsg{} + _, dataOk := msg.GetData(dataMsg) + if !dataOk { + t.Errorf("%v: GetData.ok = %v, want = true", test.desc, dataOk) + } else if !reflect.DeepEqual(dataMsg, test.dataMsg) { + t.Errorf("%v: GetData.msg = %+v, want = %+v", test.desc, dataMsg, test.dataMsg) + } + + if got, want := rest, test.input[len(test.input)-test.restLen:]; !bytes.Equal(got, want) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, got, want) + } + } +} + +func TestAttrView(t *testing.T) { + tests := []struct { + desc string + input []byte + + // Outputs for ParseFirst. + hdr linux.NetlinkAttrHeader + value []byte + restLen int + ok bool + + // Outputs for Empty. + isEmpty bool + }{ + { + desc: "valid", + input: []byte{ + 0x06, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x00, 0x00, // Data with 2 bytes padding + }, + hdr: linux.NetlinkAttrHeader{ + Length: 6, + Type: 1, + }, + value: []byte{0x30, 0x31}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 0, + ok: true, + isEmpty: false, + }, + { + desc: "at alignment with rest data", + input: []byte{ + 0x08, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + 0xFF, 0xFE, // Rest data + }, + hdr: linux.NetlinkAttrHeader{ + Length: 8, + Type: 1, + }, + value: []byte{0x30, 0x31, 0x32, 0x33}, + restLen: 2, + ok: true, + isEmpty: false, + }, + { + desc: "hdr.Length too long", + input: []byte{ + 0xFF, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "hdr.Length too short", + input: []byte{ + 0x01, 0x00, // Length + 0x01, 0x00, // Type + 0x30, 0x31, 0x32, 0x33, // Data + }, + ok: false, + isEmpty: false, + }, + { + desc: "empty", + input: []byte{}, + ok: false, + isEmpty: true, + }, + } + for _, test := range tests { + attrs := netlink.AttrsView(test.input) + + // Test ParseFirst(). + hdr, value, rest, ok := attrs.ParseFirst() + if ok != test.ok { + t.Errorf("%v: got ok = %v, want = %v", test.desc, ok, test.ok) + } else if test.ok { + if !reflect.DeepEqual(hdr, test.hdr) { + t.Errorf("%v: got hdr = %+v, want = %+v", test.desc, hdr, test.hdr) + } + if !bytes.Equal(value, test.value) { + t.Errorf("%v: got value = %v, want = %v", test.desc, value, test.value) + } + if wantRest := test.input[len(test.input)-test.restLen:]; !bytes.Equal(rest, wantRest) { + t.Errorf("%v: got rest = %v, want = %v", test.desc, rest, wantRest) + } + } + + // Test Empty(). + if got, want := attrs.Empty(), test.isEmpty; got != want { + t.Errorf("%v: got empty = %v, want = %v", test.desc, got, want) + } + } +} diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 07f860a49..b0dc70e5c 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -42,7 +42,7 @@ type Protocol interface { // If err == nil, any messages added to ms will be sent back to the // other end of the socket. Setting ms.Multi will cause an NLMSG_DONE // message to be sent even if ms contains no messages. - ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *MessageSet) *syserr.Error + ProcessMessage(ctx context.Context, msg *Message, ms *MessageSet) *syserr.Error } // Provider is a function that creates a new Protocol for a specific netlink diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 622a1eafc..93127398d 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -10,13 +10,11 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket/netlink", "//pkg/syserr", - "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 2b3c7f5b3..c84d8bd7c 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -17,16 +17,15 @@ package route import ( "bytes" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" - "gvisor.dev/gvisor/pkg/usermem" ) // commandKind describes the operational class of a message type. @@ -69,13 +68,7 @@ func (p *Protocol) CanSend() bool { } // dumpLinks handles RTM_GETLINK dump requests. -func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { - // TODO(b/68878065): Only the dump variant of the types below are - // supported. - if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { - return syserr.ErrNotSupported - } - +func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some // userspace applications (including glibc) still include rtgenmsg. @@ -99,44 +92,105 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader return nil } - for id, i := range stack.Interfaces() { - m := ms.AddMessage(linux.NetlinkMessageHeader{ - Type: linux.RTM_NEWLINK, - }) + for idx, i := range stack.Interfaces() { + addNewLinkMessage(ms, idx, i) + } - m.Put(linux.InterfaceInfoMessage{ - Family: linux.AF_UNSPEC, - Type: i.DeviceType, - Index: id, - Flags: i.Flags, - }) + return nil +} - m.PutAttrString(linux.IFLA_IFNAME, i.Name) - m.PutAttr(linux.IFLA_MTU, i.MTU) +// getLinks handles RTM_GETLINK requests. +func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network devices. + return nil + } - mac := make([]byte, 6) - brd := mac - if len(i.Addr) > 0 { - mac = i.Addr - brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) + // Parse message. + var ifi linux.InterfaceInfoMessage + attrs, ok := msg.GetData(&ifi) + if !ok { + return syserr.ErrInvalidArgument + } + + // Parse attributes. + var byName []byte + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument } - m.PutAttr(linux.IFLA_ADDRESS, mac) - m.PutAttr(linux.IFLA_BROADCAST, brd) + attrs = rest - // TODO(gvisor.dev/issue/578): There are many more attributes. + switch ahdr.Type { + case linux.IFLA_IFNAME: + if len(value) < 1 { + return syserr.ErrInvalidArgument + } + byName = value[:len(value)-1] + + // TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK. + } } + found := false + for idx, i := range stack.Interfaces() { + switch { + case ifi.Index > 0: + if idx != ifi.Index { + continue + } + case byName != nil: + if string(byName) != i.Name { + continue + } + default: + // Criteria not specified. + return syserr.ErrInvalidArgument + } + + addNewLinkMessage(ms, idx, i) + found = true + break + } + if !found { + return syserr.ErrNoDevice + } return nil } -// dumpAddrs handles RTM_GETADDR dump requests. -func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { - // TODO(b/68878065): Only the dump variant of the types below are - // supported. - if hdr.Flags&linux.NLM_F_DUMP != linux.NLM_F_DUMP { - return syserr.ErrNotSupported +// addNewLinkMessage appends RTM_NEWLINK message for the given interface into +// the message set. +func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.RTM_NEWLINK, + }) + + m.Put(linux.InterfaceInfoMessage{ + Family: linux.AF_UNSPEC, + Type: i.DeviceType, + Index: idx, + Flags: i.Flags, + }) + + m.PutAttrString(linux.IFLA_IFNAME, i.Name) + m.PutAttr(linux.IFLA_MTU, i.MTU) + + mac := make([]byte, 6) + brd := mac + if len(i.Addr) > 0 { + mac = i.Addr + brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) } + m.PutAttr(linux.IFLA_ADDRESS, mac) + m.PutAttr(linux.IFLA_BROADCAST, brd) + + // TODO(gvisor.dev/issue/578): There are many more attributes. +} +// dumpAddrs handles RTM_GETADDR dump requests. +func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETADDR dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. @@ -168,6 +222,7 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader Index: uint32(id), }) + m.PutAttr(linux.IFA_LOCAL, []byte(a.Addr)) m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr)) // TODO(gvisor.dev/issue/578): There are many more attributes. @@ -252,12 +307,12 @@ func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) { } // parseForDestination parses a message as format of RouteMessage-RtAttr-dst. -func parseForDestination(data []byte) ([]byte, *syserr.Error) { +func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) { var rtMsg linux.RouteMessage - if len(data) < linux.SizeOfRouteMessage { + attrs, ok := msg.GetData(&rtMsg) + if !ok { return nil, syserr.ErrInvalidArgument } - binary.Unmarshal(data[:linux.SizeOfRouteMessage], usermem.ByteOrder, &rtMsg) // iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See // commit bc234301af12. Note we don't check this flag for backward // compatibility. @@ -265,26 +320,15 @@ func parseForDestination(data []byte) ([]byte, *syserr.Error) { return nil, syserr.ErrNotSupported } - data = data[linux.SizeOfRouteMessage:] - - // TODO(gvisor.dev/issue/1611): Add generic attribute parsing. - var rtAttr linux.RtAttr - if len(data) < linux.SizeOfRtAttr { - return nil, syserr.ErrInvalidArgument + // Expect first attribute is RTA_DST. + if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST { + return value, nil } - binary.Unmarshal(data[:linux.SizeOfRtAttr], usermem.ByteOrder, &rtAttr) - if rtAttr.Type != linux.RTA_DST { - return nil, syserr.ErrInvalidArgument - } - - if len(data) < int(rtAttr.Len) { - return nil, syserr.ErrInvalidArgument - } - return data[linux.SizeOfRtAttr:rtAttr.Len], nil + return nil, syserr.ErrInvalidArgument } // dumpRoutes handles RTM_GETROUTE requests. -func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETROUTE dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. @@ -295,10 +339,11 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade return nil } + hdr := msg.Header() routeTables := stack.RouteTable() if hdr.Flags == linux.NLM_F_REQUEST { - dst, err := parseForDestination(data) + dst, err := parseForDestination(msg) if err != nil { return err } @@ -357,10 +402,55 @@ func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeade return nil } +// newAddr handles RTM_NEWADDR requests. +func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network stack. + return syserr.ErrProtocolNotSupported + } + + var ifa linux.InterfaceAddrMessage + attrs, ok := msg.GetData(&ifa) + if !ok { + return syserr.ErrInvalidArgument + } + + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + attrs = rest + + switch ahdr.Type { + case linux.IFA_LOCAL: + err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ + Family: ifa.Family, + PrefixLen: ifa.PrefixLen, + Flags: ifa.Flags, + Addr: value, + }) + if err == syscall.EEXIST { + flags := msg.Header().Flags + if flags&linux.NLM_F_EXCL != 0 { + return syserr.ErrExists + } + } else if err != nil { + return syserr.ErrInvalidArgument + } + } + } + return nil +} + // ProcessMessage implements netlink.Protocol.ProcessMessage. -func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + hdr := msg.Header() + // All messages start with a 1 byte protocol family. - if len(data) < 1 { + var family uint8 + if _, ok := msg.GetData(&family); !ok { // Linux ignores messages missing the protocol family. See // net/core/rtnetlink.c:rtnetlink_rcv_msg. return nil @@ -374,16 +464,32 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH } } - switch hdr.Type { - case linux.RTM_GETLINK: - return p.dumpLinks(ctx, hdr, data, ms) - case linux.RTM_GETADDR: - return p.dumpAddrs(ctx, hdr, data, ms) - case linux.RTM_GETROUTE: - return p.dumpRoutes(ctx, hdr, data, ms) - default: - return syserr.ErrNotSupported + if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { + // TODO(b/68878065): Only the dump variant of the types below are + // supported. + switch hdr.Type { + case linux.RTM_GETLINK: + return p.dumpLinks(ctx, msg, ms) + case linux.RTM_GETADDR: + return p.dumpAddrs(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } + } else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST { + switch hdr.Type { + case linux.RTM_GETLINK: + return p.getLink(ctx, msg, ms) + case linux.RTM_GETROUTE: + return p.dumpRoutes(ctx, msg, ms) + case linux.RTM_NEWADDR: + return p.newAddr(ctx, msg, ms) + default: + return syserr.ErrNotSupported + } } + return syserr.ErrNotSupported } // init registers the NETLINK_ROUTE provider. diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index c4b95debb..2ca02567d 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -644,47 +644,38 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error return nil } -func (s *Socket) dumpErrorMesage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error { +func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) - m.Put(linux.NetlinkErrorMessage{ Error: int32(-err.ToLinux().Number()), Header: hdr, }) - return nil +} +func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { + m := ms.AddMessage(linux.NetlinkMessageHeader{ + Type: linux.NLMSG_ERROR, + }) + m.Put(linux.NetlinkErrorMessage{ + Error: 0, + Header: hdr, + }) } // processMessages handles each message in buf, passing it to the protocol // handler for final handling. func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { - if len(buf) < linux.NetlinkMessageHeaderSize { + msg, rest, ok := ParseMessage(buf) + if !ok { // Linux ignores messages that are too short. See // net/netlink/af_netlink.c:netlink_rcv_skb. break } - - var hdr linux.NetlinkMessageHeader - binary.Unmarshal(buf[:linux.NetlinkMessageHeaderSize], usermem.ByteOrder, &hdr) - - if hdr.Length < linux.NetlinkMessageHeaderSize || uint64(hdr.Length) > uint64(len(buf)) { - // Linux ignores malformed messages. See - // net/netlink/af_netlink.c:netlink_rcv_skb. - break - } - - // Data from this message. - data := buf[linux.NetlinkMessageHeaderSize:hdr.Length] - - // Advance to the next message. - next := alignUp(int(hdr.Length), linux.NLMSG_ALIGNTO) - if next >= len(buf)-1 { - next = len(buf) - 1 - } - buf = buf[next:] + buf = rest + hdr := msg.Header() // Ignore control messages. if hdr.Type < linux.NLMSG_MIN_TYPE { @@ -692,19 +683,10 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } ms := NewMessageSet(s.portID, hdr.Seq) - var err *syserr.Error - // TODO(b/68877377): ACKs not supported yet. - if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { - err = syserr.ErrNotSupported - } else { - - err = s.protocol.ProcessMessage(ctx, hdr, data, ms) - } - if err != nil { - ms = NewMessageSet(s.portID, hdr.Seq) - if err := s.dumpErrorMesage(ctx, hdr, ms, err); err != nil { - return err - } + if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil { + dumpErrorMesage(hdr, ms, err) + } else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { + dumpAckMesage(hdr, ms) } if err := s.sendResponse(ctx, ms); err != nil { diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go index 1ee4296bc..029ba21b5 100644 --- a/pkg/sentry/socket/netlink/uevent/protocol.go +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -49,7 +49,7 @@ func (p *Protocol) CanSend() bool { } // ProcessMessage implements netlink.Protocol.ProcessMessage. -func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error { +func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // Silently ignore all messages. return nil } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 8619cc506..13a9a60b4 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -712,14 +712,44 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { - addr, family, err := AddressAndFamily(sockaddr) - if err != nil { - return err + if len(sockaddr) < 2 { + return syserr.ErrInvalidArgument } - if err := s.checkFamily(family, true /* exact */); err != nil { - return err + + family := usermem.ByteOrder.Uint16(sockaddr) + var addr tcpip.FullAddress + + // Bind for AF_PACKET requires only family, protocol and ifindex. + // In function AddressAndFamily, we check the address length which is + // not needed for AF_PACKET bind. + if family == linux.AF_PACKET { + var a linux.SockAddrLink + if len(sockaddr) < sockAddrLinkSize { + return syserr.ErrInvalidArgument + } + binary.Unmarshal(sockaddr[:sockAddrLinkSize], usermem.ByteOrder, &a) + + if a.Protocol != uint16(s.protocol) { + return syserr.ErrInvalidArgument + } + + addr = tcpip.FullAddress{ + NIC: tcpip.NICID(a.InterfaceIndex), + Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), + } + } else { + var err *syserr.Error + addr, family, err = AddressAndFamily(sockaddr) + if err != nil { + return err + } + + if err = s.checkFamily(family, true /* exact */); err != nil { + return err + } + + addr = s.mapFamily(addr, family) } - addr = s.mapFamily(addr, family) // Issue the bind request to the endpoint. return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) @@ -1260,6 +1290,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return int32(time.Duration(v) / time.Second), nil + case linux.TCP_DEFER_ACCEPT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.TCPDeferAcceptOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return int32(time.Duration(v) / time.Second), nil + default: emitUnimplementedEventTCP(t, name) } @@ -1306,6 +1348,22 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf } return ib, nil + case linux.IPV6_RECVTCLASS: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveTClassOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + var o int32 + if v { + o = 1 + } + return o, nil + default: emitUnimplementedEventIPv6(t, name) } @@ -1402,6 +1460,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in } return o, nil + case linux.IP_PKTINFO: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.ReceiveIPPacketInfoOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + var o int32 + if v { + o = 1 + } + return o, nil + default: emitUnimplementedEventIP(t, name) } @@ -1713,6 +1786,16 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * v := usermem.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + case linux.TCP_DEFER_ACCEPT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + v := int32(usermem.ByteOrder.Uint32(optVal)) + if v < 0 { + v = 0 + } + return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) @@ -1740,6 +1823,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) linux.IPV6_IPSEC_POLICY, linux.IPV6_JOIN_ANYCAST, linux.IPV6_LEAVE_ANYCAST, + // TODO(b/148887420): Add support for IPV6_PKTINFO. linux.IPV6_PKTINFO, linux.IPV6_ROUTER_ALERT, linux.IPV6_XFRM_POLICY, @@ -1765,6 +1849,14 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) } return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv6TrafficClassOption(v))) + case linux.IPV6_RECVTCLASS: + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) + default: emitUnimplementedEventIPv6(t, name) } @@ -1927,6 +2019,16 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0)) + case linux.IP_PKTINFO: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0)) + case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, @@ -1942,7 +2044,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, - linux.IP_PKTINFO, linux.IP_RECVERR, linux.IP_RECVFRAGSIZE, linux.IP_RECVOPTS, @@ -2039,7 +2140,6 @@ func emitUnimplementedEventIPv6(t *kernel.Task, name int) { linux.IPV6_RECVPATHMTU, linux.IPV6_RECVPKTINFO, linux.IPV6_RECVRTHDR, - linux.IPV6_RECVTCLASS, linux.IPV6_RTHDR, linux.IPV6_RTHDRDSTOPTS, linux.IPV6_TCLASS, @@ -2207,11 +2307,16 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq var copied int // Copy as many views as possible into the user-provided buffer. - for dst.NumBytes() != 0 { + for { + // Always do at least one fetchReadView, even if the number of bytes to + // read is 0. err = s.fetchReadView() if err != nil { break } + if dst.NumBytes() == 0 { + break + } var n int var e error @@ -2368,10 +2473,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe func (s *SocketOperations) controlMessages() socket.ControlMessages { return socket.ControlMessages{ IP: tcpip.ControlMessages{ - HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, - Timestamp: s.readCM.Timestamp, - HasTOS: s.readCM.HasTOS, - TOS: s.readCM.TOS, + HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, + Timestamp: s.readCM.Timestamp, + HasTOS: s.readCM.HasTOS, + TOS: s.readCM.TOS, + HasTClass: s.readCM.HasTClass, + TClass: s.readCM.TClass, + HasIPPacketInfo: s.readCM.HasIPPacketInfo, + PacketInfo: s.readCM.PacketInfo, }, } } @@ -2558,7 +2667,9 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, } // Add bytes removed from the endpoint but not yet sent to the caller. + s.readMu.Lock() v += len(s.readView) + s.readMu.Unlock() if v > math.MaxInt32 { v = math.MaxInt32 diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 5afff2564..5f181f017 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -75,6 +75,8 @@ func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol in switch protocol { case syscall.IPPROTO_ICMP: return header.ICMPv4ProtocolNumber, true, nil + case syscall.IPPROTO_ICMPV6: + return header.ICMPv6ProtocolNumber, true, nil case syscall.IPPROTO_UDP: return header.UDPProtocolNumber, true, nil case syscall.IPPROTO_TCP: diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 31ea66eca..0692482e9 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -20,6 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -88,6 +90,59 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { return nicAddrs } +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + var ( + protocol tcpip.NetworkProtocolNumber + address tcpip.Address + ) + switch addr.Family { + case linux.AF_INET: + if len(addr.Addr) < header.IPv4AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv4AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv4.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv4AddressSize]) + + case linux.AF_INET6: + if len(addr.Addr) < header.IPv6AddressSize { + return syserror.EINVAL + } + if addr.PrefixLen > header.IPv6AddressSize*8 { + return syserror.EINVAL + } + protocol = ipv6.ProtocolNumber + address = tcpip.Address(addr.Addr[:header.IPv6AddressSize]) + + default: + return syserror.ENOTSUP + } + + protocolAddress := tcpip.ProtocolAddress{ + Protocol: protocol, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: address, + PrefixLen: int(addr.PrefixLen), + }, + } + + // Attach address to interface. + if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + return syserr.TranslateNetstackError(err).ToError() + } + + // Add route for local network. + s.Stack.AddRoute(tcpip.Route{ + Destination: protocolAddress.AddressWithPrefix.Subnet(), + Gateway: "", // No gateway for local network. + NIC: tcpip.NICID(idx), + }) + return nil +} + // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { var rs tcp.ReceiveBufferSizeOption diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 762a946fe..88d5db9fc 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "capability.go", "clone.go", + "epoll.go", "futex.go", "linux64_amd64.go", "linux64_arm64.go", @@ -30,7 +31,6 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", - "//pkg/sentry/socket/control", "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netstack", "//pkg/sentry/syscalls/linux", diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go new file mode 100644 index 000000000..a6e48b836 --- /dev/null +++ b/pkg/sentry/strace/epoll.go @@ -0,0 +1,89 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "fmt" + "strings" + + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string { + var e linux.EpollEvent + if _, err := t.CopyIn(eventAddr, &e); err != nil { + return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err) + } + var sb strings.Builder + fmt.Fprintf(&sb, "%#x ", eventAddr) + writeEpollEvent(&sb, e) + return sb.String() +} + +func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes uint64) string { + var sb strings.Builder + fmt.Fprintf(&sb, "%#x {", eventsAddr) + addr := eventsAddr + for i := uint64(0); i < numEvents; i++ { + var e linux.EpollEvent + if _, err := t.CopyIn(addr, &e); err != nil { + fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err) + continue + } + writeEpollEvent(&sb, e) + if uint64(sb.Len()) >= maxBytes { + sb.WriteString("...") + break + } + if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok { + fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr) + continue + } + } + sb.WriteString("}") + return sb.String() +} + +func writeEpollEvent(sb *strings.Builder, e linux.EpollEvent) { + events := epollEventEvents.Parse(uint64(e.Events)) + fmt.Fprintf(sb, "{events=%s data=[%#x, %#x]}", events, e.Data[0], e.Data[1]) +} + +var epollCtlOps = abi.ValueSet{ + linux.EPOLL_CTL_ADD: "EPOLL_CTL_ADD", + linux.EPOLL_CTL_DEL: "EPOLL_CTL_DEL", + linux.EPOLL_CTL_MOD: "EPOLL_CTL_MOD", +} + +var epollEventEvents = abi.FlagSet{ + {Flag: linux.EPOLLIN, Name: "EPOLLIN"}, + {Flag: linux.EPOLLPRI, Name: "EPOLLPRI"}, + {Flag: linux.EPOLLOUT, Name: "EPOLLOUT"}, + {Flag: linux.EPOLLERR, Name: "EPOLLERR"}, + {Flag: linux.EPOLLHUP, Name: "EPULLHUP"}, + {Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"}, + {Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"}, + {Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"}, + {Flag: linux.EPOLLWRBAND, Name: "EPOLLWRBAND"}, + {Flag: linux.EPOLLMSG, Name: "EPOLLMSG"}, + {Flag: linux.EPOLLRDHUP, Name: "EPOLLRDHUP"}, + {Flag: linux.EPOLLEXCLUSIVE, Name: "EPOLLEXCLUSIVE"}, + {Flag: linux.EPOLLWAKEUP, Name: "EPOLLWAKEUP"}, + {Flag: linux.EPOLLONESHOT, Name: "EPOLLONESHOT"}, + {Flag: linux.EPOLLET, Name: "EPOLLET"}, +} diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index 85ec66fd3..71b92eaee 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -78,8 +78,8 @@ var linuxAMD64 = SyscallMap{ 51: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen), 52: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen), 53: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex), - 54: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex), - 55: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex), + 54: makeSyscallInfo("setsockopt", FD, SockOptLevel, SockOptName, SetSockOptVal, Hex /* length by value, not a pointer */), + 55: makeSyscallInfo("getsockopt", FD, SockOptLevel, SockOptName, GetSockOptVal, SockLen), 56: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 57: makeSyscallInfo("fork"), 58: makeSyscallInfo("vfork"), @@ -256,8 +256,8 @@ var linuxAMD64 = SyscallMap{ 229: makeSyscallInfo("clock_getres", Hex, PostTimespec), 230: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec), 231: makeSyscallInfo("exit_group", Hex), - 232: makeSyscallInfo("epoll_wait", Hex, Hex, Hex, Hex), - 233: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex), + 232: makeSyscallInfo("epoll_wait", FD, EpollEvents, Hex, Hex), + 233: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent), 234: makeSyscallInfo("tgkill", Hex, Hex, Signal), 235: makeSyscallInfo("utimes", Path, Timeval), // 236: vserver (not implemented in the Linux kernel) @@ -305,7 +305,7 @@ var linuxAMD64 = SyscallMap{ 278: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex), 279: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex), 280: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex), - 281: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex), + 281: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex), 282: makeSyscallInfo("signalfd", Hex, Hex, Hex), 283: makeSyscallInfo("timerfd_create", Hex, Hex), 284: makeSyscallInfo("eventfd", Hex), diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index 8bc38545f..bd7361a52 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -45,8 +45,8 @@ var linuxARM64 = SyscallMap{ 18: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex), 19: makeSyscallInfo("eventfd2", Hex, Hex), 20: makeSyscallInfo("epoll_create1", Hex), - 21: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex), - 22: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex), + 21: makeSyscallInfo("epoll_ctl", FD, EpollCtlOp, FD, EpollEvent), + 22: makeSyscallInfo("epoll_pwait", FD, EpollEvents, Hex, Hex, SigSet, Hex), 23: makeSyscallInfo("dup", FD), 24: makeSyscallInfo("dup3", FD, FD, Hex), 25: makeSyscallInfo("fcntl", FD, Hex, Hex), diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index d2079c85f..c0512de89 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" @@ -220,13 +219,13 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) if skipData { strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length)) - i += control.AlignUp(length, width) + i += binary.AlignUp(length, width) continue } switch h.Type { case linux.SCM_RIGHTS: - rightsSize := control.AlignDown(length, linux.SizeOfControlMessageRight) + rightsSize := binary.AlignDown(length, linux.SizeOfControlMessageRight) numRights := rightsSize / linux.SizeOfControlMessageRight fds := make(linux.ControlMessageRights, numRights) @@ -295,7 +294,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) default: panic("unreachable") } - i += control.AlignUp(length, width) + i += binary.AlignUp(length, width) } return fmt.Sprintf("%#x %s", addr, strings.Join(strs, ", ")) @@ -419,3 +418,227 @@ func sockFlags(flags int32) string { } return SocketFlagSet.Parse(uint64(flags)) } + +func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen usermem.Addr, maximumBlobSize uint, rval uintptr) string { + if int(rval) < 0 { + return hexNum(uint64(optVal)) + } + if optVal == 0 { + return "null" + } + l, err := copySockLen(t, optLen) + if err != nil { + return fmt.Sprintf("%#x {error reading length: %v}", optLen, err) + } + return sockOptVal(t, level, optname, optVal, uint64(l), maximumBlobSize) +} + +func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string { + switch optLen { + case 1: + var v uint8 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + case 2: + var v uint16 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + case 4: + var v uint32 + _, err := t.CopyIn(optVal, &v) + if err != nil { + return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) + } + return fmt.Sprintf("%#x {value=%v}", optVal, v) + default: + return dump(t, optVal, uint(optLen), maximumBlobSize) + } +} + +var sockOptLevels = abi.ValueSet{ + linux.SOL_IP: "SOL_IP", + linux.SOL_SOCKET: "SOL_SOCKET", + linux.SOL_TCP: "SOL_TCP", + linux.SOL_UDP: "SOL_UDP", + linux.SOL_IPV6: "SOL_IPV6", + linux.SOL_ICMPV6: "SOL_ICMPV6", + linux.SOL_RAW: "SOL_RAW", + linux.SOL_PACKET: "SOL_PACKET", + linux.SOL_NETLINK: "SOL_NETLINK", +} + +var sockOptNames = map[uint64]abi.ValueSet{ + linux.SOL_IP: { + linux.IP_TTL: "IP_TTL", + linux.IP_MULTICAST_TTL: "IP_MULTICAST_TTL", + linux.IP_MULTICAST_IF: "IP_MULTICAST_IF", + linux.IP_MULTICAST_LOOP: "IP_MULTICAST_LOOP", + linux.IP_TOS: "IP_TOS", + linux.IP_RECVTOS: "IP_RECVTOS", + linux.IPT_SO_GET_INFO: "IPT_SO_GET_INFO", + linux.IPT_SO_GET_ENTRIES: "IPT_SO_GET_ENTRIES", + linux.IP_ADD_MEMBERSHIP: "IP_ADD_MEMBERSHIP", + linux.IP_DROP_MEMBERSHIP: "IP_DROP_MEMBERSHIP", + linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", + linux.IP_ADD_SOURCE_MEMBERSHIP: "IP_ADD_SOURCE_MEMBERSHIP", + linux.IP_BIND_ADDRESS_NO_PORT: "IP_BIND_ADDRESS_NO_PORT", + linux.IP_BLOCK_SOURCE: "IP_BLOCK_SOURCE", + linux.IP_CHECKSUM: "IP_CHECKSUM", + linux.IP_DROP_SOURCE_MEMBERSHIP: "IP_DROP_SOURCE_MEMBERSHIP", + linux.IP_FREEBIND: "IP_FREEBIND", + linux.IP_HDRINCL: "IP_HDRINCL", + linux.IP_IPSEC_POLICY: "IP_IPSEC_POLICY", + linux.IP_MINTTL: "IP_MINTTL", + linux.IP_MSFILTER: "IP_MSFILTER", + linux.IP_MTU_DISCOVER: "IP_MTU_DISCOVER", + linux.IP_MULTICAST_ALL: "IP_MULTICAST_ALL", + linux.IP_NODEFRAG: "IP_NODEFRAG", + linux.IP_OPTIONS: "IP_OPTIONS", + linux.IP_PASSSEC: "IP_PASSSEC", + linux.IP_PKTINFO: "IP_PKTINFO", + linux.IP_RECVERR: "IP_RECVERR", + linux.IP_RECVFRAGSIZE: "IP_RECVFRAGSIZE", + linux.IP_RECVOPTS: "IP_RECVOPTS", + linux.IP_RECVORIGDSTADDR: "IP_RECVORIGDSTADDR", + linux.IP_RECVTTL: "IP_RECVTTL", + linux.IP_RETOPTS: "IP_RETOPTS", + linux.IP_TRANSPARENT: "IP_TRANSPARENT", + linux.IP_UNBLOCK_SOURCE: "IP_UNBLOCK_SOURCE", + linux.IP_UNICAST_IF: "IP_UNICAST_IF", + linux.IP_XFRM_POLICY: "IP_XFRM_POLICY", + linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", + linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", + linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", + linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", + linux.MCAST_MSFILTER: "MCAST_MSFILTER", + linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", + linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", + linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", + linux.IP_MTU: "IP_MTU", + }, + linux.SOL_SOCKET: { + linux.SO_ERROR: "SO_ERROR", + linux.SO_PEERCRED: "SO_PEERCRED", + linux.SO_PASSCRED: "SO_PASSCRED", + linux.SO_SNDBUF: "SO_SNDBUF", + linux.SO_RCVBUF: "SO_RCVBUF", + linux.SO_REUSEADDR: "SO_REUSEADDR", + linux.SO_REUSEPORT: "SO_REUSEPORT", + linux.SO_BINDTODEVICE: "SO_BINDTODEVICE", + linux.SO_BROADCAST: "SO_BROADCAST", + linux.SO_KEEPALIVE: "SO_KEEPALIVE", + linux.SO_LINGER: "SO_LINGER", + linux.SO_SNDTIMEO: "SO_SNDTIMEO", + linux.SO_RCVTIMEO: "SO_RCVTIMEO", + linux.SO_OOBINLINE: "SO_OOBINLINE", + linux.SO_TIMESTAMP: "SO_TIMESTAMP", + }, + linux.SOL_TCP: { + linux.TCP_NODELAY: "TCP_NODELAY", + linux.TCP_CORK: "TCP_CORK", + linux.TCP_QUICKACK: "TCP_QUICKACK", + linux.TCP_MAXSEG: "TCP_MAXSEG", + linux.TCP_KEEPIDLE: "TCP_KEEPIDLE", + linux.TCP_KEEPINTVL: "TCP_KEEPINTVL", + linux.TCP_USER_TIMEOUT: "TCP_USER_TIMEOUT", + linux.TCP_INFO: "TCP_INFO", + linux.TCP_CC_INFO: "TCP_CC_INFO", + linux.TCP_NOTSENT_LOWAT: "TCP_NOTSENT_LOWAT", + linux.TCP_ZEROCOPY_RECEIVE: "TCP_ZEROCOPY_RECEIVE", + linux.TCP_CONGESTION: "TCP_CONGESTION", + linux.TCP_LINGER2: "TCP_LINGER2", + linux.TCP_DEFER_ACCEPT: "TCP_DEFER_ACCEPT", + linux.TCP_REPAIR_OPTIONS: "TCP_REPAIR_OPTIONS", + linux.TCP_INQ: "TCP_INQ", + linux.TCP_FASTOPEN: "TCP_FASTOPEN", + linux.TCP_FASTOPEN_CONNECT: "TCP_FASTOPEN_CONNECT", + linux.TCP_FASTOPEN_KEY: "TCP_FASTOPEN_KEY", + linux.TCP_FASTOPEN_NO_COOKIE: "TCP_FASTOPEN_NO_COOKIE", + linux.TCP_KEEPCNT: "TCP_KEEPCNT", + linux.TCP_QUEUE_SEQ: "TCP_QUEUE_SEQ", + linux.TCP_REPAIR: "TCP_REPAIR", + linux.TCP_REPAIR_QUEUE: "TCP_REPAIR_QUEUE", + linux.TCP_REPAIR_WINDOW: "TCP_REPAIR_WINDOW", + linux.TCP_SAVED_SYN: "TCP_SAVED_SYN", + linux.TCP_SAVE_SYN: "TCP_SAVE_SYN", + linux.TCP_SYNCNT: "TCP_SYNCNT", + linux.TCP_THIN_DUPACK: "TCP_THIN_DUPACK", + linux.TCP_THIN_LINEAR_TIMEOUTS: "TCP_THIN_LINEAR_TIMEOUTS", + linux.TCP_TIMESTAMP: "TCP_TIMESTAMP", + linux.TCP_ULP: "TCP_ULP", + linux.TCP_WINDOW_CLAMP: "TCP_WINDOW_CLAMP", + }, + linux.SOL_IPV6: { + linux.IPV6_V6ONLY: "IPV6_V6ONLY", + linux.IPV6_PATHMTU: "IPV6_PATHMTU", + linux.IPV6_TCLASS: "IPV6_TCLASS", + linux.IPV6_ADD_MEMBERSHIP: "IPV6_ADD_MEMBERSHIP", + linux.IPV6_DROP_MEMBERSHIP: "IPV6_DROP_MEMBERSHIP", + linux.IPV6_IPSEC_POLICY: "IPV6_IPSEC_POLICY", + linux.IPV6_JOIN_ANYCAST: "IPV6_JOIN_ANYCAST", + linux.IPV6_LEAVE_ANYCAST: "IPV6_LEAVE_ANYCAST", + linux.IPV6_PKTINFO: "IPV6_PKTINFO", + linux.IPV6_ROUTER_ALERT: "IPV6_ROUTER_ALERT", + linux.IPV6_XFRM_POLICY: "IPV6_XFRM_POLICY", + linux.MCAST_BLOCK_SOURCE: "MCAST_BLOCK_SOURCE", + linux.MCAST_JOIN_GROUP: "MCAST_JOIN_GROUP", + linux.MCAST_JOIN_SOURCE_GROUP: "MCAST_JOIN_SOURCE_GROUP", + linux.MCAST_LEAVE_GROUP: "MCAST_LEAVE_GROUP", + linux.MCAST_LEAVE_SOURCE_GROUP: "MCAST_LEAVE_SOURCE_GROUP", + linux.MCAST_UNBLOCK_SOURCE: "MCAST_UNBLOCK_SOURCE", + linux.IPV6_2292DSTOPTS: "IPV6_2292DSTOPTS", + linux.IPV6_2292HOPLIMIT: "IPV6_2292HOPLIMIT", + linux.IPV6_2292HOPOPTS: "IPV6_2292HOPOPTS", + linux.IPV6_2292PKTINFO: "IPV6_2292PKTINFO", + linux.IPV6_2292PKTOPTIONS: "IPV6_2292PKTOPTIONS", + linux.IPV6_2292RTHDR: "IPV6_2292RTHDR", + linux.IPV6_ADDR_PREFERENCES: "IPV6_ADDR_PREFERENCES", + linux.IPV6_AUTOFLOWLABEL: "IPV6_AUTOFLOWLABEL", + linux.IPV6_DONTFRAG: "IPV6_DONTFRAG", + linux.IPV6_DSTOPTS: "IPV6_DSTOPTS", + linux.IPV6_FLOWINFO: "IPV6_FLOWINFO", + linux.IPV6_FLOWINFO_SEND: "IPV6_FLOWINFO_SEND", + linux.IPV6_FLOWLABEL_MGR: "IPV6_FLOWLABEL_MGR", + linux.IPV6_FREEBIND: "IPV6_FREEBIND", + linux.IPV6_HOPOPTS: "IPV6_HOPOPTS", + linux.IPV6_MINHOPCOUNT: "IPV6_MINHOPCOUNT", + linux.IPV6_MTU: "IPV6_MTU", + linux.IPV6_MTU_DISCOVER: "IPV6_MTU_DISCOVER", + linux.IPV6_MULTICAST_ALL: "IPV6_MULTICAST_ALL", + linux.IPV6_MULTICAST_HOPS: "IPV6_MULTICAST_HOPS", + linux.IPV6_MULTICAST_IF: "IPV6_MULTICAST_IF", + linux.IPV6_MULTICAST_LOOP: "IPV6_MULTICAST_LOOP", + linux.IPV6_RECVDSTOPTS: "IPV6_RECVDSTOPTS", + linux.IPV6_RECVERR: "IPV6_RECVERR", + linux.IPV6_RECVFRAGSIZE: "IPV6_RECVFRAGSIZE", + linux.IPV6_RECVHOPLIMIT: "IPV6_RECVHOPLIMIT", + linux.IPV6_RECVHOPOPTS: "IPV6_RECVHOPOPTS", + linux.IPV6_RECVORIGDSTADDR: "IPV6_RECVORIGDSTADDR", + linux.IPV6_RECVPATHMTU: "IPV6_RECVPATHMTU", + linux.IPV6_RECVPKTINFO: "IPV6_RECVPKTINFO", + linux.IPV6_RECVRTHDR: "IPV6_RECVRTHDR", + linux.IPV6_RECVTCLASS: "IPV6_RECVTCLASS", + linux.IPV6_RTHDR: "IPV6_RTHDR", + linux.IPV6_RTHDRDSTOPTS: "IPV6_RTHDRDSTOPTS", + linux.IPV6_TRANSPARENT: "IPV6_TRANSPARENT", + linux.IPV6_UNICAST_HOPS: "IPV6_UNICAST_HOPS", + linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF", + linux.MCAST_MSFILTER: "MCAST_MSFILTER", + linux.IPV6_ADDRFORM: "IPV6_ADDRFORM", + }, + linux.SOL_NETLINK: { + linux.NETLINK_BROADCAST_ERROR: "NETLINK_BROADCAST_ERROR", + linux.NETLINK_CAP_ACK: "NETLINK_CAP_ACK", + linux.NETLINK_DUMP_STRICT_CHK: "NETLINK_DUMP_STRICT_CHK", + linux.NETLINK_EXT_ACK: "NETLINK_EXT_ACK", + linux.NETLINK_LIST_MEMBERSHIPS: "NETLINK_LIST_MEMBERSHIPS", + linux.NETLINK_NO_ENOBUFS: "NETLINK_NO_ENOBUFS", + linux.NETLINK_PKTINFO: "NETLINK_PKTINFO", + }, +} diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 3fc4a47fc..77655558e 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -55,6 +55,14 @@ var ItimerTypes = abi.ValueSet{ linux.ITIMER_PROF: "ITIMER_PROF", } +func hexNum(num uint64) string { + return "0x" + strconv.FormatUint(num, 16) +} + +func hexArg(arg arch.SyscallArgument) string { + return hexNum(arg.Uint64()) +} + func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, maxBytes uint64) string { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return fmt.Sprintf("%#x (error decoding iovecs: invalid iovcnt)", addr) @@ -133,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string { } func fd(t *kernel.Task, fd int32) string { + if kernel.VFS2Enabled { + return fdVFS2(t, fd) + } + root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -161,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string { return fmt.Sprintf("%#x %s", fd, name) } +func fdVFS2(t *kernel.Task, fd int32) string { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + + vfsObj := root.Mount().Filesystem().VirtualFilesystem() + if fd == linux.AT_FDCWD { + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, wd) + return fmt.Sprintf("AT_FDCWD %s", name) + } + + file := t.GetFileVFS2(fd) + if file == nil { + // Cast FD to uint64 to avoid printing negative hex. + return fmt.Sprintf("%#x (bad FD)", uint64(fd)) + } + defer file.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) + return fmt.Sprintf("%#x %s", fd, name) +} + func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 _, err := t.CopyIn(addr, &fds) @@ -389,6 +425,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, path(t, args[arg].Pointer())) case ExecveStringVector: output = append(output, stringVector(t, args[arg].Pointer())) + case SetSockOptVal: + output = append(output, sockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Uint64() /* optLen */, maximumBlobSize)) + case SockOptLevel: + output = append(output, sockOptLevels.Parse(args[arg].Uint64())) + case SockOptName: + output = append(output, sockOptNames[args[arg-1].Uint64() /* level */].Parse(args[arg].Uint64())) case SockAddr: output = append(output, sockAddr(t, args[arg].Pointer(), uint32(args[arg+1].Uint64()))) case SockLen: @@ -439,6 +481,12 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer())) case PollFDs: output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false)) + case EpollCtlOp: + output = append(output, epollCtlOps.Parse(uint64(args[arg].Int()))) + case EpollEvent: + output = append(output, epollEvent(t, args[arg].Pointer())) + case EpollEvents: + output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize))) case SelectFDSet: output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer())) case Oct: @@ -446,7 +494,7 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo case Hex: fallthrough default: - output = append(output, "0x"+strconv.FormatUint(args[arg].Uint64(), 16)) + output = append(output, hexArg(args[arg])) } } @@ -507,6 +555,14 @@ func (i *SyscallInfo) post(t *kernel.Task, args arch.SyscallArguments, rval uint output[arg] = capData(t, args[arg-1].Pointer(), args[arg].Pointer()) case PollFDs: output[arg] = pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), true) + case EpollEvents: + output[arg] = epollEvents(t, args[arg].Pointer(), uint64(rval), uint64(maximumBlobSize)) + case GetSockOptVal: + output[arg] = getSockOptVal(t, args[arg-2].Uint64() /* level */, args[arg-1].Uint64() /* optName */, args[arg].Pointer() /* optVal */, args[arg+1].Pointer() /* optLen */, maximumBlobSize, rval) + case SetSockOptVal: + // No need to print the value again. While it usually + // isn't, the string version of this arg can be long. + output[arg] = hexArg(args[arg]) } } } diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 24e29a2ba..7e69b9279 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -207,9 +207,37 @@ const ( // array is in the next argument. PollFDs - // SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of - // fds represented must be the first argument. + // SelectFDSet is an fd_set argument in select(2)/pselect(2). The + // number of FDs represented must be the first argument. SelectFDSet + + // GetSockOptVal is the optval argument in getsockopt(2). + // + // Formatted after syscall execution. + GetSockOptVal + + // SetSockOptVal is the optval argument in setsockopt(2). + // + // Contents omitted after syscall execution. + SetSockOptVal + + // SockOptLevel is the level argument in getsockopt(2) and + // setsockopt(2). + SockOptLevel + + // SockOptLevel is the optname argument in getsockopt(2) and + // setsockopt(2). + SockOptName + + // EpollCtlOp is the op argument to epoll_ctl(2). + EpollCtlOp + + // EpollEvent is the event argument in epoll_ctl(2). + EpollEvent + + // EpollEvents is an array of struct epoll_event. It is the events + // argument in epoll_wait(2)/epoll_pwait(2). + EpollEvents ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index be16ee686..0d24fd3c4 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -74,6 +74,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index 7435b50bf..79066ad2a 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -234,12 +234,12 @@ var AMD64 = &kernel.SyscallTable{ 191: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), 192: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), 193: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 195: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 196: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 197: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 198: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 199: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 194: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 195: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 196: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 197: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 198: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 199: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), 200: syscalls.Supported("tkill", Tkill), 201: syscalls.Supported("time", Time), 202: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index 03a39fe65..7421619de 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -47,12 +47,12 @@ var ARM64 = &kernel.SyscallTable{ 8: syscalls.PartiallySupported("getxattr", GetXattr, "Only supported for tmpfs.", nil), 9: syscalls.PartiallySupported("lgetxattr", LGetXattr, "Only supported for tmpfs.", nil), 10: syscalls.PartiallySupported("fgetxattr", FGetXattr, "Only supported for tmpfs.", nil), - 11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 12: syscalls.ErrorWithEvent("llistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 13: syscalls.ErrorWithEvent("flistxattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 14: syscalls.ErrorWithEvent("removexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 15: syscalls.ErrorWithEvent("lremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), - 16: syscalls.ErrorWithEvent("fremovexattr", syserror.ENOTSUP, "Requires filesystem support.", nil), + 11: syscalls.PartiallySupported("listxattr", ListXattr, "Only supported for tmpfs", nil), + 12: syscalls.PartiallySupported("llistxattr", LListXattr, "Only supported for tmpfs", nil), + 13: syscalls.PartiallySupported("flistxattr", FListXattr, "Only supported for tmpfs", nil), + 14: syscalls.PartiallySupported("removexattr", RemoveXattr, "Only supported for tmpfs", nil), + 15: syscalls.PartiallySupported("lremovexattr", LRemoveXattr, "Only supported for tmpfs", nil), + 16: syscalls.PartiallySupported("fremovexattr", FRemoveXattr, "Only supported for tmpfs", nil), 17: syscalls.Supported("getcwd", Getcwd), 18: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil), 19: syscalls.Supported("eventfd2", Eventfd2), diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 5f11b496c..3ab93fbde 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() @@ -83,8 +85,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } mask = waiter.EventMaskFromLinux(e.Events) - data[0] = e.Fd - data[1] = e.Data + data = e.Data } // Perform the requested operations. @@ -165,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return EpollWait(t, args) } + +// LINT.ThenChange(vfs2/epoll.go) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 421845ebb..d10a9bed8 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } +// LINT.IfChange + func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/ioctl.go) + +// LINT.IfChange + // Getcwd implements the linux syscall getcwd(2). func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } +// LINT.ThenChange(vfs2/fscontext.go) + +// LINT.IfChange + // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/fd.go) + const ( _FADV_NORMAL = 0 _FADV_RANDOM = 1 @@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.IfChange + func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1218,7 +1236,7 @@ func rmdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { return syserror.ENOTEMPTY } - if err := fs.MayDelete(t, root, d, name); err != nil { + if err := d.MayDelete(t, root, name); err != nil { return err } @@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } +// LINT.ThenChange(vfs2/stat.go) + +// LINT.IfChange + func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1491,7 +1517,7 @@ func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { return syserror.ENOTDIR } - if err := fs.MayDelete(t, root, d, name); err != nil { + if err := d.MayDelete(t, root, name); err != nil { return err } @@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, unlinkAt(t, dirFD, addr) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Truncate implements linux syscall truncate(2). func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.ThenChange(vfs2/setstat.go) + // Umask implements linux syscall umask(2). func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mask := args[0].ModeT() @@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(mask), nil, nil } +// LINT.IfChange + // Change ownership of a file. // // uid and gid may be -1, in which case they will not be changed. @@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } +// LINT.ThenChange(vfs2/setstat.go) + +// LINT.IfChange + func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { @@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } +// LINT.ThenChange(vfs2/filesystem.go) + // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index f66f4ffde..b126fecc0 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -27,6 +27,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { func (ds *direntSerializer) Written() int { return ds.written } + +// LINT.ThenChange(vfs2/getdents.go) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 297e920c4..3f7691eae 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(offset), nil, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 9959f6e61..91694d374 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo return uintptr(addr), nil, nil } +// LINT.IfChange + // Mmap implements linux syscall mmap(2). func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() @@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(rv), nil, err } +// LINT.ThenChange(vfs2/mmap.go) + // Munmap implements linux syscall munmap(2). func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 98db32d77..9c6728530 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(file.Dirent) + t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 227692f06..78a2cb750 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskRead contains events that can be triggered on reads. EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr @@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index f43d6c155..fd642834b 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -25,6 +25,10 @@ import ( // doSplice implements a blocking splice operation. func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonBlocking bool) (int64, error) { + if opts.Length < 0 || opts.SrcStart < 0 || opts.DstStart < 0 { + return 0, syserror.EINVAL + } + var ( total int64 n int64 @@ -82,11 +86,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) - // Don't send a negative number of bytes. - if count < 0 { - return 0, nil, syserror.EINVAL - } - // Get files. inFile := t.GetFile(inFD) if inFile == nil { @@ -136,11 +135,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - // The offset must be valid. - if offset < 0 { - return 0, nil, syserror.EINVAL - } - // Do the splice. n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, @@ -227,6 +221,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := t.CopyIn(outOffset, &offset); err != nil { return 0, nil, err } + // Use the destination offset. opts.DstOffset = true opts.DstStart = offset @@ -244,6 +239,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if _, err := t.CopyIn(inOffset, &offset); err != nil { return 0, nil, err } + // Use the source offset. opts.SrcOffset = true opts.SrcStart = offset diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index c841abccb..9bd2df104 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Stat implements linux syscall stat(2). func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -112,7 +114,8 @@ func stat(t *kernel.Task, d *fs.Dirent, dirPath bool, statAddr usermem.Addr) err if err != nil { return err } - return copyOutStat(t, statAddr, d.Inode.StableAttr, uattr) + s := statFromAttrs(t, d.Inode.StableAttr, uattr) + return s.CopyOut(t, statAddr) } // fstat implements fstat for the given *fs.File. @@ -121,7 +124,8 @@ func fstat(t *kernel.Task, f *fs.File, statAddr usermem.Addr) error { if err != nil { return err } - return copyOutStat(t, statAddr, f.Dirent.Inode.StableAttr, uattr) + s := statFromAttrs(t, f.Dirent.Inode.StableAttr, uattr) + return s.CopyOut(t, statAddr) } // Statx implements linux syscall statx(2). @@ -277,3 +281,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { _, err = t.CopyOut(addr, &statfs) return err } + +// LINT.ThenChange(vfs2/stat.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 75a567bd4..0a04a6113 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -12,64 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build amd64 +// +build amd64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) -// copyOutStat copies the attributes (sattr, uattr) to the struct stat at -// address dst in t's address space. It encodes the stat struct to bytes -// manually, as stat() is a very common syscall for many applications, and -// t.CopyObjectOut has noticeable performance impact due to its many slice -// allocations and use of reflection. -func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { - b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] - - // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) - // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) - // Nlink (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uattr.Links) - // Mode (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) - // UID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - // GID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) - // Padding (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, 0) - // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) - // Size (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) - // Blksize (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.BlockSize)) - // Blocks (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) - - // ATime - atime := uattr.AccessTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) - - // MTime - mtime := uattr.ModificationTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) - - // CTime - ctime := uattr.StatusChangeTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - - _, err := t.CopyOutBytes(dst, b) - return err +// LINT.IfChange + +func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { + return linux.Stat{ + Dev: sattr.DeviceID, + Ino: sattr.InodeID, + Nlink: uattr.Links, + Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)), + Size: uattr.Size, + Blksize: sattr.BlockSize, + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + } } + +// LINT.ThenChange(vfs2/stat_amd64.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 80c98d05c..5a3b1bfad 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -12,66 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. -//+build arm64 +// +build arm64 package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) -// copyOutStat copies the attributes (sattr, uattr) to the struct stat at -// address dst in t's address space. It encodes the stat struct to bytes -// manually, as stat() is a very common syscall for many applications, and -// t.CopyObjectOut has noticeable performance impact due to its many slice -// allocations and use of reflection. -func copyOutStat(t *kernel.Task, dst usermem.Addr, sattr fs.StableAttr, uattr fs.UnstableAttr) error { - b := t.CopyScratchBuffer(int(linux.SizeOfStat))[:0] - - // Dev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.DeviceID)) - // Ino (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(sattr.InodeID)) - // Mode (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, sattr.Type.LinuxType()|uint32(uattr.Perms.LinuxMode())) - // Nlink (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Links)) - // UID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - // GID (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow())) - // Rdev (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor))) - // Padding (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, 0) - // Size (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Size)) - // Blksize (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, uint32(sattr.BlockSize)) - // Padding (uint32) - b = binary.AppendUint32(b, usermem.ByteOrder, 0) - // Blocks (uint64) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(uattr.Usage/512)) - - // ATime - atime := uattr.AccessTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(atime.Nsec)) - - // MTime - mtime := uattr.ModificationTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(mtime.Nsec)) - - // CTime - ctime := uattr.StatusChangeTime.Timespec() - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Sec)) - b = binary.AppendUint64(b, usermem.ByteOrder, uint64(ctime.Nsec)) - - _, err := t.CopyOutBytes(dst, b) - return err +// LINT.IfChange + +func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { + return linux.Stat{ + Dev: sattr.DeviceID, + Ino: sattr.InodeID, + Nlink: uint32(uattr.Links), + Mode: sattr.Type.LinuxType() | uint32(uattr.Perms.LinuxMode()), + UID: uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()), + GID: uint32(uattr.Owner.GID.In(t.UserNamespace()).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(sattr.DeviceFileMajor, sattr.DeviceFileMinor)), + Size: uattr.Size, + Blksize: int32(sattr.BlockSize), + Blocks: uattr.Usage / 512, + ATime: uattr.AccessTime.Timespec(), + MTime: uattr.ModificationTime.Timespec(), + CTime: uattr.StatusChangeTime.Timespec(), + } } + +// LINT.ThenChange(vfs2/stat_arm64.go) diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 3e55235bd..5ad465ae3 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // Sync implements linux system call sync(2). func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.MountNamespace().SyncAll(t) @@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } + +// LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0c9e2255d..00915fdde 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" @@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user defer root.DecRef() var wd *fs.Dirent - var executable *fs.File + var executable fsbridge.File var closeOnExec bool if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { // Even if the pathname is absolute, we may still need the wd @@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { - executable = f + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { + return 0, nil, err + } + executable = fsbridge.NewFSFile(f) } else { wd = f.Dirent wd.IncRef() @@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user // Load the new TaskContext. remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ - Mounts: t.MountNamespace(), - Root: root, - WorkingDirectory: wd, + Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), RemainingTraversals: &remainingTraversals, ResolveFinal: resolveFinal, Filename: pathname, diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index 432351917..a4c400f87 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -146,7 +146,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - return uintptr(id), nil, nil + return 0, nil, nil } // TimerSettime implements linux syscall timer_settime(2). diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index aba892939..506ee54ce 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskWrite contains events that can be triggered on writes. // @@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) ( return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index efb95555c..2de5e3422 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // GetXattr implements linux syscall getxattr(2). func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getXattrFromPath(t, args, true) @@ -49,14 +51,11 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer f.DecRef() - n, value, err := getXattr(t, f.Dirent, nameAddr, size) + n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) if err != nil { return 0, nil, err } - if _, err := t.CopyOutBytes(valueAddr, []byte(value)); err != nil { - return 0, nil, err - } return uintptr(n), nil, nil } @@ -71,41 +70,36 @@ func getXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink return 0, nil, err } - valueLen := 0 - err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + n := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } - n, value, err := getXattr(t, d, nameAddr, size) - valueLen = n - if err != nil { - return err - } - - _, err = t.CopyOutBytes(valueAddr, []byte(value)) + n, err = getXattr(t, d, nameAddr, valueAddr, size) return err }) if err != nil { return 0, nil, err } - return uintptr(valueLen), nil, nil + + return uintptr(n), nil, nil } // getXattr implements getxattr(2) from the given *fs.Dirent. -func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) (int, string, error) { - if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { - return 0, "", err - } - +func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, size uint64) (int, error) { name, err := copyInXattrName(t, nameAddr) if err != nil { - return 0, "", err + return 0, err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil { + return 0, err } // TODO(b/148380782): Support xattrs in namespaces other than "user". if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return 0, "", syserror.EOPNOTSUPP + return 0, syserror.EOPNOTSUPP } // If getxattr(2) is called with size 0, the size of the value will be @@ -118,18 +112,22 @@ func getXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr, size uint64) value, err := d.Inode.GetXattr(t, name, requestedSize) if err != nil { - return 0, "", err + return 0, err } n := len(value) if uint64(n) > requestedSize { - return 0, "", syserror.ERANGE + return 0, syserror.ERANGE } // Don't copy out the attribute value if size is 0. if size == 0 { - return n, "", nil + return n, nil } - return n, value, nil + + if _, err = t.CopyOutBytes(valueAddr, []byte(value)); err != nil { + return 0, err + } + return n, nil } // SetXattr implements linux syscall setxattr(2). @@ -172,7 +170,7 @@ func setXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink return 0, nil, err } - return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { if dirPath && !fs.IsDir(d.Inode.StableAttr) { return syserror.ENOTDIR } @@ -187,12 +185,12 @@ func setXattr(t *kernel.Task, d *fs.Dirent, nameAddr, valueAddr usermem.Addr, si return syserror.EINVAL } - if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + name, err := copyInXattrName(t, nameAddr) + if err != nil { return err } - name, err := copyInXattrName(t, nameAddr) - if err != nil { + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { return err } @@ -226,12 +224,18 @@ func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { return name, nil } +// Restrict xattrs to regular files and directories. +// +// TODO(b/148380782): In Linux, this restriction technically only applies to +// xattrs in the "user.*" namespace. Make file type checks specific to the +// namespace once we allow other xattr prefixes. +func xattrFileTypeOk(i *fs.Inode) bool { + return fs.IsRegular(i.StableAttr) || fs.IsDir(i.StableAttr) +} + func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error { // Restrict xattrs to regular files and directories. - // - // In Linux, this restriction technically only applies to xattrs in the - // "user.*" namespace, but we don't allow any other xattr prefixes anyway. - if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) { + if !xattrFileTypeOk(i) { if perms.Write { return syserror.EPERM } @@ -240,3 +244,181 @@ func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error return i.CheckPermission(t, perms) } + +// ListXattr implements linux syscall listxattr(2). +func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, true) +} + +// LListXattr implements linux syscall llistxattr(2). +func LListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listXattrFromPath(t, args, false) +} + +// FListXattr implements linux syscall flistxattr(2). +func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + n, err := listXattr(t, f.Dirent, listAddr, size) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := uint64(args[2].SizeT()) + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + n := 0 + err = fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + n, err = listXattr(t, d, listAddr, size) + return err + }) + if err != nil { + return 0, nil, err + } + + return uintptr(n), nil, nil +} + +func listXattr(t *kernel.Task, d *fs.Dirent, addr usermem.Addr, size uint64) (int, error) { + if !xattrFileTypeOk(d.Inode) { + return 0, nil + } + + // If listxattr(2) is called with size 0, the buffer size needed to contain + // the xattr list will be returned successfully even if it is nonzero. In + // that case, we need to retrieve the entire list so we can compute and + // return the correct size. + requestedSize := size + if size == 0 || size > linux.XATTR_SIZE_MAX { + requestedSize = linux.XATTR_SIZE_MAX + } + xattrs, err := d.Inode.ListXattr(t, requestedSize) + if err != nil { + return 0, err + } + + // TODO(b/148380782): support namespaces other than "user". + for x := range xattrs { + if !strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + delete(xattrs, x) + } + } + + listSize := xattrListSize(xattrs) + if listSize > linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + if uint64(listSize) > requestedSize { + return 0, syserror.ERANGE + } + + // Don't copy out the attributes if size is 0. + if size == 0 { + return listSize, nil + } + + buf := make([]byte, 0, listSize) + for x := range xattrs { + buf = append(buf, []byte(x)...) + buf = append(buf, 0) + } + if _, err := t.CopyOutBytes(addr, buf); err != nil { + return 0, err + } + + return len(buf), nil +} + +func xattrListSize(xattrs map[string]struct{}) int { + size := 0 + for x := range xattrs { + size += len(x) + 1 + } + return size +} + +// RemoveXattr implements linux syscall removexattr(2). +func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, true) +} + +// LRemoveXattr implements linux syscall lremovexattr(2). +func LRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return removeXattrFromPath(t, args, false) +} + +// FRemoveXattr implements linux syscall fremovexattr(2). +func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + // TODO(b/113957122): Return EBADF if the fd was opened with O_PATH. + f := t.GetFile(fd) + if f == nil { + return 0, nil, syserror.EBADF + } + defer f.DecRef() + + return 0, nil, removeXattr(t, f.Dirent, nameAddr) +} + +func removeXattrFromPath(t *kernel.Task, args arch.SyscallArguments, resolveSymlink bool) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */) + if err != nil { + return 0, nil, err + } + + return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, resolveSymlink, func(_ *fs.Dirent, d *fs.Dirent, _ uint) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + + return removeXattr(t, d, nameAddr) + }) +} + +// removeXattr implements removexattr(2) from the given *fs.Dirent. +func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil { + return err + } + + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + + return d.Inode.RemoveXattr(t, d, name) +} + +// LINT.ThenChange(vfs2/xattr.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 6b8a00b6e..e7695e995 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,18 +5,46 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "epoll.go", + "epoll_unsafe.go", + "execve.go", + "fd.go", + "filesystem.go", + "fscontext.go", + "getdents.go", + "ioctl.go", "linux64.go", "linux64_override_amd64.go", "linux64_override_arm64.go", - "sys_read.go", + "mmap.go", + "path.go", + "poll.go", + "read_write.go", + "setstat.go", + "stat.go", + "stat_amd64.go", + "stat_arm64.go", + "sync.go", + "xattr.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/gohacks", "//pkg/sentry/arch", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", "//pkg/sentry/syscalls", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go new file mode 100644 index 000000000..d6cb0e79a --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -0,0 +1,225 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EpollCreate1 implements Linux syscall epoll_create1(2). +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^linux.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCreate implements Linux syscall epoll_create(2). +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + // "Since Linux 2.6.8, the size argument is ignored, but must be greater + // than zero" - epoll_create(2) + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCtl implements Linux syscall epoll_ctl(2). +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + op := args[1].Int() + fd := args[2].Int() + eventAddr := args[3].Pointer() + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + if epfile == file { + return 0, nil, syserror.EINVAL + } + + var event linux.EpollEvent + switch op { + case linux.EPOLL_CTL_ADD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.AddInterest(file, fd, event) + case linux.EPOLL_CTL_DEL: + return 0, nil, ep.DeleteInterest(file, fd) + case linux.EPOLL_CTL_MOD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.ModifyInterest(file, fd, event) + default: + return 0, nil, syserror.EINVAL + } +} + +// EpollWait implements Linux syscall epoll_wait(2). +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS + if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { + return 0, nil, syserror.EINVAL + } + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, + // maxEvents), so that the buffer can be allocated on the stack. + var ( + events [16]linux.EpollEvent + total int + ch chan struct{} + haveDeadline bool + deadline ktime.Time + ) + for { + batchEvents := len(events) + if batchEvents > maxEvents { + batchEvents = maxEvents + } + n := ep.ReadEvents(events[:batchEvents]) + maxEvents -= n + if n != 0 { + // Copy what we read out. + copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n]) + eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) + total += copiedEvents + if err != nil { + if total != 0 { + return uintptr(total), nil, nil + } + return 0, nil, err + } + // If we've filled the application's event buffer, we're done. + if maxEvents == 0 { + return uintptr(total), nil, nil + } + // Loop if we read a full batch, under the expectation that there + // may be more events to read. + if n == batchEvents { + continue + } + } + // We get here if n != batchEvents. If we read any number of events + // (just now, or in a previous iteration of this loop), or if timeout + // is 0 (such that epoll_wait should be non-blocking), return the + // events we've read so far to the application. + if total != 0 || timeout == 0 { + return uintptr(total), nil, nil + } + // In the first iteration of this loop, register with the epoll + // instance for readability events, but then immediately continue the + // loop since we need to retry ReadEvents() before blocking. In all + // subsequent iterations, block until events are available, the timeout + // expires, or an interrupt arrives. + if ch == nil { + var w waiter.Entry + w, ch = waiter.NewChannelEntry(nil) + epfile.EventRegister(&w, waiter.EventIn) + defer epfile.EventUnregister(&w) + } else { + // Set up the timer if a timeout was specified. + if timeout > 0 && !haveDeadline { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + // total must be 0 since otherwise we would have returned + // above. + return 0, nil, err + } + } + } +} + +// EpollPwait implements Linux syscall epoll_pwait(2). +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go new file mode 100644 index 000000000..825f325bf --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go @@ -0,0 +1,44 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "reflect" + "runtime" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{})) + +func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) { + if len(events) == 0 { + return 0, nil + } + // Cast events to a byte slice for copying. + var eventBytes []byte + eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes)) + eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0]))) + eventBytesHdr.Len = len(events) * sizeofEpollEvent + eventBytesHdr.Cap = len(events) * sizeofEpollEvent + copiedBytes, err := t.CopyOutBytes(addr, eventBytes) + runtime.KeepAlive(events) + copiedEvents := copiedBytes / sizeofEpollEvent // rounded down + return copiedEvents, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go new file mode 100644 index 000000000..aef0078a8 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -0,0 +1,137 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/loader" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathnameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */) +} + +// Execveat implements linux syscall execveat(2). +func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) + if err != nil { + return 0, nil, err + } + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + var executable fsbridge.File + closeOnExec := false + if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { + // We must open the executable ourselves since dirfd is used as the + // starting point while resolving path, but the task working directory + // is used as the starting point while resolving interpreters (Linux: + // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() => + // do_open_execat(fd=AT_FDCWD)), and the loader package is currently + // incapable of handling this correctly. + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + start := dirfile.VirtualDentry() + start.IncRef() + dirfile.DecRef() + closeOnExec = dirfileFlags.CloseOnExec + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + }) + start.DecRef() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + executable = fsbridge.NewVFSFile(file) + } + + // Load the new TaskContext. + mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change + defer mntns.DecRef() + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Opener: fsbridge.NewVFSLookup(mntns, root, wd), + RemainingTraversals: &remainingTraversals, + ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + if se != nil { + return 0, nil, se.ToError() + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go new file mode 100644 index 000000000..3afcea665 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -0,0 +1,147 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Close implements Linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + _, file := t.FDTable().Remove(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.OnClose(t) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) +} + +// Dup implements Linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newFD), nil, nil +} + +// Dup2 implements Linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + + if oldfd == newfd { + // As long as oldfd is valid, dup2() does nothing and returns newfd. + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + file.DecRef() + return uintptr(newfd), nil, nil + } + + return dup3(t, oldfd, newfd, 0) +} + +// Dup3 implements Linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + return dup3(t, oldfd, newfd, flags) +} + +func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { + if flags&^linux.O_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Int() + + file, flags := t.FDTable().GetVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: + minfd := args[2].Int() + fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case linux.F_GETFD: + return uintptr(flags.ToLinuxFDFlags()), nil, nil + case linux.F_SETFD: + flags := args[2].Uint() + t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&linux.FD_CLOEXEC != 0, + }) + return 0, nil, nil + case linux.F_GETFL: + return uintptr(file.StatusFlags()), nil, nil + case linux.F_SETFL: + return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + default: + // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go new file mode 100644 index 000000000..fc5ceea4c --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -0,0 +1,326 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Link implements Linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Linkat implements Linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Int() + return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { + return syserror.EINVAL + } + if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return syserror.ENOENT + } + + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) +} + +// Mkdir implements Linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements Linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, mkdirat(t, dirfd, addr, mode) +} + +func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ + Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) +} + +// Mknod implements Linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + dev := args[2].Uint() + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev) +} + +// Mknodat implements Linux syscall mknodat(2). +func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + dev := args[3].Uint() + return 0, nil, mknodat(t, dirfd, addr, mode, dev) +} + +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + major, minor := linux.DecodeDeviceID(dev) + return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ + Mode: linux.FileMode(mode &^ t.FSContext().Umask()), + DevMajor: uint32(major), + DevMinor: minor, + }) +} + +// Open implements Linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + mode := args[2].ModeT() + return openat(t, linux.AT_FDCWD, addr, flags, mode) +} + +// Openat implements Linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + flags := args[2].Uint() + mode := args[3].ModeT() + return openat(t, dirfd, addr, flags, mode) +} + +// Creat implements Linux syscall creat(2). +func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) +} + +func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ + Flags: flags, + Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + return uintptr(fd), nil, err +} + +// Rename implements Linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Renameat implements Linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) +} + +// Renameat2 implements Linux syscall renameat2(2). +func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Uint() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error { + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ + Flags: flags, + }) +} + +// Rmdir implements Linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) +} + +func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) +} + +// Unlink implements Linux syscall unlink(2). +func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) +} + +func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) +} + +// Unlinkat implements Linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + + if flags&^linux.AT_REMOVEDIR != 0 { + return 0, nil, syserror.EINVAL + } + + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirat(t, dirfd, pathAddr) + } + return 0, nil, unlinkat(t, dirfd, pathAddr) +} + +// Symlink implements Linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + linkpathAddr := args[1].Pointer() + return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) +} + +// Symlinkat implements Linux syscall symlinkat(2). +func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + newdirfd := args[1].Int() + linkpathAddr := args[2].Pointer() + return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) +} + +func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error { + target, err := t.CopyInString(targetAddr, linux.PATH_MAX) + if err != nil { + return err + } + linkpath, err := copyInPath(t, linkpathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go new file mode 100644 index 000000000..317409a18 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getcwd implements Linux syscall getcwd(2). +func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + + root := t.FSContext().RootDirectoryVFS2() + wd := t.FSContext().WorkingDirectoryVFS2() + s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) + root.DecRef() + wd.DecRef() + if err != nil { + return 0, nil, err + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Construct a byte slice containing a NUL terminator. + buf := t.CopyScratchBuffer(len(s) + 1) + copy(buf, s) + buf[len(buf)-1] = 0 + + // Write the pathname slice. + n, err := t.CopyOutBytes(addr, buf) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Chdir implements Linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Fchdir implements Linux syscall fchdir(2). +func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Chroot implements Linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetRootDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go new file mode 100644 index 000000000..ddc140b65 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Getdents implements Linux syscall getdents(2). +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, false /* isGetdents64 */) +} + +// Getdents64 implements Linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, true /* isGetdents64 */) +} + +func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + cb := getGetdentsCallback(t, addr, size, isGetdents64) + err := file.IterDirents(t, cb) + n := size - cb.remaining + putGetdentsCallback(cb) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +type getdentsCallback struct { + t *kernel.Task + addr usermem.Addr + remaining int + isGetdents64 bool +} + +var getdentsCallbackPool = sync.Pool{ + New: func() interface{} { + return &getdentsCallback{} + }, +} + +func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback { + cb := getdentsCallbackPool.Get().(*getdentsCallback) + *cb = getdentsCallback{ + t: t, + addr: addr, + remaining: size, + isGetdents64: isGetdents64, + } + return cb +} + +func putGetdentsCallback(cb *getdentsCallback) { + cb.t = nil + getdentsCallbackPool.Put(cb) +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { + var buf []byte + if cb.isGetdents64 { + // struct linux_dirent64 { + // ino64_t d_ino; /* 64-bit inode number */ + // off64_t d_off; /* 64-bit offset to next structure */ + // unsigned short d_reclen; /* Size of this dirent */ + // unsigned char d_type; /* File type */ + // char d_name[]; /* Filename (null-terminated) */ + // }; + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + buf[18] = dirent.Type + copy(buf[19:], dirent.Name) + buf[size-1] = 0 // NUL terminator + } else { + // struct linux_dirent { + // unsigned long d_ino; /* Inode number */ + // unsigned long d_off; /* Offset to next linux_dirent */ + // unsigned short d_reclen; /* Length of this linux_dirent */ + // char d_name[]; /* Filename (null-terminated) */ + // /* length is actually (d_reclen - 2 - + // offsetof(struct linux_dirent, d_name)) */ + // /* + // char pad; // Zero padding byte + // char d_type; // File type (only since Linux + // // 2.6.4); offset is (d_reclen - 1) + // */ + // }; + if cb.t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) + } + size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + copy(buf[18:], dirent.Name) + buf[size-3] = 0 // NUL terminator + buf[size-2] = 0 // zero padding byte + buf[size-1] = dirent.Type + } + n, err := cb.t.CopyOutBytes(cb.addr, buf) + if err != nil { + // Don't report partially-written dirents by advancing cb.addr or + // cb.remaining. + return err + } + cb.addr += usermem.Addr(n) + cb.remaining -= n + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go new file mode 100644 index 000000000..5a2418da9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Ioctl implements Linux syscall ioctl(2). +func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + ret, err := file.Ioctl(t, t.MemoryManager(), args) + return ret, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index c134714ee..7d220bc20 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package vfs2 import ( @@ -22,4 +24,142 @@ import ( // Override syscall table to add syscalls implementations from this package. func Override(table map[uintptr]kernel.Syscall) { table[0] = syscalls.Supported("read", Read) + table[1] = syscalls.Supported("write", Write) + table[2] = syscalls.Supported("open", Open) + table[3] = syscalls.Supported("close", Close) + table[4] = syscalls.Supported("stat", Stat) + table[5] = syscalls.Supported("fstat", Fstat) + table[6] = syscalls.Supported("lstat", Lstat) + table[7] = syscalls.Supported("poll", Poll) + table[8] = syscalls.Supported("lseek", Lseek) + table[9] = syscalls.Supported("mmap", Mmap) + table[16] = syscalls.Supported("ioctl", Ioctl) + table[17] = syscalls.Supported("pread64", Pread64) + table[18] = syscalls.Supported("pwrite64", Pwrite64) + table[19] = syscalls.Supported("readv", Readv) + table[20] = syscalls.Supported("writev", Writev) + table[21] = syscalls.Supported("access", Access) + delete(table, 22) // pipe + table[23] = syscalls.Supported("select", Select) + table[32] = syscalls.Supported("dup", Dup) + table[33] = syscalls.Supported("dup2", Dup2) + delete(table, 40) // sendfile + delete(table, 41) // socket + delete(table, 42) // connect + delete(table, 43) // accept + delete(table, 44) // sendto + delete(table, 45) // recvfrom + delete(table, 46) // sendmsg + delete(table, 47) // recvmsg + delete(table, 48) // shutdown + delete(table, 49) // bind + delete(table, 50) // listen + delete(table, 51) // getsockname + delete(table, 52) // getpeername + delete(table, 53) // socketpair + delete(table, 54) // setsockopt + delete(table, 55) // getsockopt + table[59] = syscalls.Supported("execve", Execve) + table[72] = syscalls.Supported("fcntl", Fcntl) + delete(table, 73) // flock + table[74] = syscalls.Supported("fsync", Fsync) + table[75] = syscalls.Supported("fdatasync", Fdatasync) + table[76] = syscalls.Supported("truncate", Truncate) + table[77] = syscalls.Supported("ftruncate", Ftruncate) + table[78] = syscalls.Supported("getdents", Getdents) + table[79] = syscalls.Supported("getcwd", Getcwd) + table[80] = syscalls.Supported("chdir", Chdir) + table[81] = syscalls.Supported("fchdir", Fchdir) + table[82] = syscalls.Supported("rename", Rename) + table[83] = syscalls.Supported("mkdir", Mkdir) + table[84] = syscalls.Supported("rmdir", Rmdir) + table[85] = syscalls.Supported("creat", Creat) + table[86] = syscalls.Supported("link", Link) + table[87] = syscalls.Supported("unlink", Unlink) + table[88] = syscalls.Supported("symlink", Symlink) + table[89] = syscalls.Supported("readlink", Readlink) + table[90] = syscalls.Supported("chmod", Chmod) + table[91] = syscalls.Supported("fchmod", Fchmod) + table[92] = syscalls.Supported("chown", Chown) + table[93] = syscalls.Supported("fchown", Fchown) + table[94] = syscalls.Supported("lchown", Lchown) + table[132] = syscalls.Supported("utime", Utime) + table[133] = syscalls.Supported("mknod", Mknod) + table[137] = syscalls.Supported("statfs", Statfs) + table[138] = syscalls.Supported("fstatfs", Fstatfs) + table[161] = syscalls.Supported("chroot", Chroot) + table[162] = syscalls.Supported("sync", Sync) + delete(table, 165) // mount + delete(table, 166) // umount2 + delete(table, 187) // readahead + table[188] = syscalls.Supported("setxattr", Setxattr) + table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + table[190] = syscalls.Supported("fsetxattr", Fsetxattr) + table[191] = syscalls.Supported("getxattr", Getxattr) + table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + table[194] = syscalls.Supported("listxattr", Listxattr) + table[195] = syscalls.Supported("llistxattr", Llistxattr) + table[196] = syscalls.Supported("flistxattr", Flistxattr) + table[197] = syscalls.Supported("removexattr", Removexattr) + table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + table[199] = syscalls.Supported("fremovexattr", Fremovexattr) + delete(table, 206) // io_setup + delete(table, 207) // io_destroy + delete(table, 208) // io_getevents + delete(table, 209) // io_submit + delete(table, 210) // io_cancel + table[213] = syscalls.Supported("epoll_create", EpollCreate) + table[217] = syscalls.Supported("getdents64", Getdents64) + delete(table, 221) // fdavise64 + table[232] = syscalls.Supported("epoll_wait", EpollWait) + table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + table[235] = syscalls.Supported("utimes", Utimes) + delete(table, 253) // inotify_init + delete(table, 254) // inotify_add_watch + delete(table, 255) // inotify_rm_watch + table[257] = syscalls.Supported("openat", Openat) + table[258] = syscalls.Supported("mkdirat", Mkdirat) + table[259] = syscalls.Supported("mknodat", Mknodat) + table[260] = syscalls.Supported("fchownat", Fchownat) + table[261] = syscalls.Supported("futimens", Futimens) + table[262] = syscalls.Supported("newfstatat", Newfstatat) + table[263] = syscalls.Supported("unlinkat", Unlinkat) + table[264] = syscalls.Supported("renameat", Renameat) + table[265] = syscalls.Supported("linkat", Linkat) + table[266] = syscalls.Supported("symlinkat", Symlinkat) + table[267] = syscalls.Supported("readlinkat", Readlinkat) + table[268] = syscalls.Supported("fchmodat", Fchmodat) + table[269] = syscalls.Supported("faccessat", Faccessat) + table[270] = syscalls.Supported("pselect", Pselect) + table[271] = syscalls.Supported("ppoll", Ppoll) + delete(table, 275) // splice + delete(table, 276) // tee + table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + table[280] = syscalls.Supported("utimensat", Utimensat) + table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + delete(table, 282) // signalfd + delete(table, 283) // timerfd_create + delete(table, 284) // eventfd + delete(table, 285) // fallocate + delete(table, 286) // timerfd_settime + delete(table, 287) // timerfd_gettime + delete(table, 288) // accept4 + delete(table, 289) // signalfd4 + delete(table, 290) // eventfd2 + table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + table[292] = syscalls.Supported("dup3", Dup3) + delete(table, 293) // pipe2 + delete(table, 294) // inotify_init1 + table[295] = syscalls.Supported("preadv", Preadv) + table[296] = syscalls.Supported("pwritev", Pwritev) + delete(table, 299) // recvmmsg + table[306] = syscalls.Supported("syncfs", Syncfs) + delete(table, 307) // sendmmsg + table[316] = syscalls.Supported("renameat2", Renameat2) + delete(table, 319) // memfd_create + table[322] = syscalls.Supported("execveat", Execveat) + table[327] = syscalls.Supported("preadv2", Preadv2) + table[328] = syscalls.Supported("pwritev2", Pwritev2) + table[332] = syscalls.Supported("statx", Statx) } diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go index 6af5c400f..a6b367468 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package vfs2 import ( diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go new file mode 100644 index 000000000..60a43f0a0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mmap implements Linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := args[4].Int() + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // mmap unconditionally requires that the FD is readable. + if !file.IsReadable() { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !file.IsWritable() { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go new file mode 100644 index 000000000..97da6c647 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -0,0 +1,94 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) { + pathname, err := t.CopyInString(addr, linux.PATH_MAX) + if err != nil { + return fspath.Path{}, err + } + return fspath.Parse(pathname), nil +} + +type taskPathOperation struct { + pop vfs.PathOperation + haveStartRef bool +} + +func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { + root := t.FSContext().RootDirectoryVFS2() + start := root + haveStartRef := false + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + root.DecRef() + return taskPathOperation{}, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + haveStartRef = true + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + root.DecRef() + return taskPathOperation{}, syserror.EBADF + } + start = dirfile.VirtualDentry() + start.IncRef() + haveStartRef = true + dirfile.DecRef() + } + } + return taskPathOperation{ + pop: vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, + haveStartRef: haveStartRef, + }, nil +} + +func (tpop *taskPathOperation) Release() { + tpop.pop.Root.DecRef() + if tpop.haveStartRef { + tpop.pop.Start.DecRef() + tpop.haveStartRef = false + } +} + +type shouldAllowEmptyPath bool + +const ( + disallowEmptyPath shouldAllowEmptyPath = false + allowEmptyPath shouldAllowEmptyPath = true +) + +type shouldFollowFinalSymlink bool + +const ( + nofollowFinalSymlink shouldFollowFinalSymlink = false + followFinalSymlink shouldFollowFinalSymlink = true +) diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go new file mode 100644 index 000000000..dbf4882da --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -0,0 +1,584 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. This has no +// equivalent in Linux; it exists in gVisor since allocation failure in Go is +// unrecoverable. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = linux.POLLOUT | linux.POLLERR + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. + selectExceptEvents = linux.POLLPRI +) + +// pollState tracks the associated file description and waiter of a PollFD. +type pollState struct { + file *vfs.FileDescription + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.GetFileVFS2(pfd.FD) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + haveTimeout := timeout >= 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + +// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return nil, syserror.EINVAL + } + + pfd := make([]linux.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err + } + } + + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := copyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= linux.POLLHUP | linux.POLLERR + } + remainingTimeout, n, err := pollBlock(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(addr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || nfds > fileCap { + return 0, syserror.EINVAL + } + + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < nBytes; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 + for i := 0; i < nBytes; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + var mask int16 + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, linux.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + if _, _, err = pollBlock(t, pfd, timeout); err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return tsRemaining.CopyOut(t, timespecAddr) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return tvRemaining.CopyOut(t, timevalAddr) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +// +// +stateify savable +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + var timeval linux.Timeval + if err := timeval.CopyIn(t, timevalAddr); err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + if t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) + } + var maskStruct sigSetWithSize + if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { + return 0, nil, err + } + if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { + return 0, nil, err + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + var timespec linux.Timespec + if err := timespec.CopyIn(t, timespecAddr); err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syserror.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} + +func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error { + if maskAddr == 0 { + return nil + } + if maskSize != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if err := mask.CopyIn(t, maskAddr); err != nil { + return err + } + mask &^= kernel.UnblockableSignals + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go new file mode 100644 index 000000000..35f6308d6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -0,0 +1,511 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr + eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Read implements Linux syscall read(2). +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Readv implements Linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pread64 implements Linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Preadv implements Linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +// Preadv2 implements Linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.ReadOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = read(t, file, dst, opts) + } else { + n, err = pread(t, file, dst, offset, opts) + } + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + +func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + n, err := file.PRead(t, dst, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.PRead(t, dst, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Write implements Linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Writev implements Linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := file.Write(t, src, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Write(t, src, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pwrite64 implements Linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Pwritev implements Linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +// Pwritev2 implements Linux syscall pwritev2(2). +func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.WriteOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = write(t, file, src, opts) + } else { + n, err = pwrite(t, file, src, offset, opts) + } + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + +func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, err := file.PWrite(t, src, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.PWrite(t, src, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Lseek implements Linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + whence := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newoff, err := file.Seek(t, offset, whence) + return uintptr(newoff), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go new file mode 100644 index 000000000..9250659ff --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -0,0 +1,380 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + +// Chmod implements Linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) +} + +// Fchmodat implements Linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, fchmodat(t, dirfd, pathAddr, mode) +} + +func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Fchmod implements Linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].ModeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Chown implements Linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) +} + +// Lchown implements Linux syscall lchown(2). +func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + owner := args[2].Int() + group := args[3].Int() + flags := args[4].Int() + return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) +} + +func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return err + } + + return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) +} + +func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { + userns := t.UserNamespace() + if owner != -1 { + kuid := userns.MapToKUID(auth.UID(owner)) + if !kuid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_UID + opts.Stat.UID = uint32(kuid) + } + if group != -1 { + kgid := userns.MapToKGID(auth.GID(group)) + if !kgid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_GID + opts.Stat.GID = uint32(kgid) + } + return nil +} + +// Fchown implements Linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + owner := args[1].Int() + group := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return 0, nil, err + } + return 0, nil, file.SetStat(t, opts) +} + +// Truncate implements Linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Ftruncate implements Linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Utime implements Linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times linux.Utime + if err := times.CopyIn(t, timesAddr); err != nil { + return 0, nil, err + } + opts.Stat.Atime.Sec = times.Actime + opts.Stat.Mtime.Sec = times.Modtime + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimes implements Linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimensat implements Linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Futimens implements Linux syscall futimens(2). +func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + timesAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, file.SetStat(t, opts) +} + +func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil + } + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_ATIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Nsec), + } + } + if times[1].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_MTIME + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Nsec), + } + } + return nil +} + +func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.SetStat() instead of + // VirtualFilesystem.SetStatAt(), since the former may be able + // to use opened file state to expedite the SetStat. + err := dirfile.SetStat(t, *opts) + dirfile.DecRef() + return err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go new file mode 100644 index 000000000..12c532310 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -0,0 +1,323 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Stat implements Linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */) +} + +// Lstat implements Linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW) +} + +// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2). +func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) +} + +func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) +} + +func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { + return linux.Timespec{ + Sec: sxts.Sec, + Nsec: int64(sxts.Nsec), + } +} + +// Fstat implements Linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + statx, err := file.Stat(t, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + return 0, nil, err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return 0, nil, stat.CopyOut(t, statAddr) +} + +// Statx implements Linux syscall statx(2). +func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + mask := args[3].Uint() + statxAddr := args[4].Pointer() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: mask, + Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for statx(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) +} + +func userifyStatx(t *kernel.Task, statx *linux.Statx) { + userns := t.UserNamespace() + statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) + statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) +} + +// Readlink implements Linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) +} + +// Access implements Linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Faccessat implements Linux syscall access(2). +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Readlinkat implements Linux syscall mknodat(2). +func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + return readlinkat(t, dirfd, pathAddr, bufAddr, size) +} + +func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { + if int(size) <= 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + // "Since Linux 2.6.39, pathname can be an empty string, in which case the + // call operates on the symbolic link referred to by dirfd ..." - + // readlinkat(2) + tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + if len(target) > int(size) { + target = target[:size] + } + n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Statfs implements Linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} + +// Fstatfs implements Linux syscall fstatfs(2). +func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufAddr := args[1].Pointer() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go new file mode 100644 index 000000000..2da538fc6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat_amd64.go @@ -0,0 +1,46 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// This takes both input and output as pointer arguments to avoid copying large +// structs. +func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint64(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int64(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go new file mode 100644 index 000000000..88b9c7627 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat_arm64.go @@ -0,0 +1,46 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// This takes both input and output as pointer arguments to avoid copying large +// structs. +func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint32(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int32(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go new file mode 100644 index 000000000..365250b0b --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -0,0 +1,87 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements Linux syscall sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) +} + +// Syncfs implements Linux syscall syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SyncFS(t) +} + +// Fsync implements Linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.Sync(t) +} + +// Fdatasync implements Linux syscall fdatasync(2). +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. + return Fsync(t, args) +} + +// SyncFileRange implements Linux syscall sync_file_range(2). +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + nbytes := args[2].Int64() + flags := args[3].Uint() + + if offset < 0 { + return 0, nil, syserror.EINVAL + } + if nbytes < 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of + // [offset, offset+nbytes). + return 0, nil, file.Sync(t) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go deleted file mode 100644 index 7667524c7..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - // EventMaskRead contains events that can be triggered on reads. - EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr -) - -// Read implements linux syscall read(2). Note that we try to get a buffer that -// is exactly the size requested because some applications like qemu expect -// they can do large reads all at once. Bug for bug. Same for other read -// calls below. -func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - addr := args[1].Pointer() - size := args[2].SizeT() - - file := t.GetFileVFS2(fd) - if file == nil { - return 0, nil, syserror.EBADF - } - defer file.DecRef() - - // Check that the size is legitimate. - si := int(size) - if si < 0 { - return 0, nil, syserror.EINVAL - } - - // Get the destination of the read. - dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, nil, err - } - - n, err := read(t, file, dst, vfs.ReadOptions{}) - t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) -} - -func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock { - return n, err - } - - // Register for notifications. - w, ch := waiter.NewChannelEntry(nil) - file.EventRegister(&w, EventMaskRead) - - total := n - for { - // Shorten dst to reflect bytes previously read. - dst = dst.DropFirst(int(n)) - - // Issue the request and break out if it completes with anything other than - // "would block". - n, err := file.Read(t, dst, opts) - total += n - if err != syserror.ErrWouldBlock { - break - } - if err := t.Block(ch); err != nil { - break - } - } - file.EventUnregister(&w) - - return total, err -} diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go new file mode 100644 index 000000000..89e9ff4d7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -0,0 +1,353 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Listxattr implements Linux syscall listxattr(2). +func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, followFinalSymlink) +} + +// Llistxattr implements Linux syscall llistxattr(2). +func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, nofollowFinalSymlink) +} + +func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Flistxattr implements Linux syscall flistxattr(2). +func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + names, err := file.Listxattr(t) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Getxattr implements Linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, followFinalSymlink) +} + +// Lgetxattr implements Linux syscall lgetxattr(2). +func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, nofollowFinalSymlink) +} + +func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Fgetxattr implements Linux syscall fgetxattr(2). +func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := file.Getxattr(t, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Setxattr implements Linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, followFinalSymlink) +} + +// Lsetxattr implements Linux syscall lsetxattr(2). +func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, nofollowFinalSymlink) +} + +func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return err + } + + return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Fsetxattr implements Linux syscall fsetxattr(2). +func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Removexattr implements Linux syscall removexattr(2). +func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, followFinalSymlink) +} + +// Lremovexattr implements Linux syscall lremovexattr(2). +func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, nofollowFinalSymlink) +} + +func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) +} + +// Fremovexattr implements Linux syscall fremovexattr(2). +func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Removexattr(t, name) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) { + if size > linux.XATTR_LIST_MAX { + size = linux.XATTR_LIST_MAX + } + var buf bytes.Buffer + for _, name := range names { + buf.WriteString(name) + buf.WriteByte(0) + } + if size == 0 { + // Return the size that would be required to accomodate the list. + return buf.Len(), nil + } + if buf.Len() > int(size) { + if size >= linux.XATTR_LIST_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(listAddr, buf.Bytes()) +} + +func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) { + if size > linux.XATTR_SIZE_MAX { + return "", syserror.E2BIG + } + buf := make([]byte, size) + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { + return "", err + } + return gohacks.StringFromImmutableBytes(buf), nil +} + +func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) { + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if size == 0 { + // Return the size that would be required to accomodate the value. + return len(value), nil + } + if len(value) > int(size) { + if size >= linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) +} diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 538c645eb..4320ad17f 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -253,6 +253,10 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) { } // MinimumTotalMemoryBytes is the minimum reported total system memory. +// +// This can be configured through options provided to the Sentry at start. +// This number is purely synthetic. This is only set before the application +// starts executing, and must not be modified. var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB // TotalMemory returns the "total usable memory" available. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index ced9d07b1..cb4deb068 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -42,14 +42,21 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fd", "//pkg/fspath", + "//pkg/gohacks", + "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index d97362b9a..82781e6d3 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -29,9 +29,10 @@ const ( CtxRoot ) -// MountNamespaceFromContext returns the MountNamespace used by ctx. It does -// not take a reference on the returned MountNamespace. If ctx is not -// associated with a MountNamespace, MountNamespaceFromContext returns nil. +// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is +// not associated with a MountNamespace, MountNamespaceFromContext returns nil. +// +// A reference is taken on the returned MountNamespace. func MountNamespaceFromContext(ctx context.Context) *MountNamespace { if v := ctx.Value(CtxMountNamespace); v != nil { return v.(*MountNamespace) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 486a76475..35b208721 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -71,6 +71,8 @@ import ( // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard // Dentries with zero references. +// +// +stateify savable type Dentry struct { // parent is this Dentry's parent in this Filesystem. If this Dentry is // independent, parent is nil. @@ -89,7 +91,7 @@ type Dentry struct { children map[string]*Dentry // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 3af2aa58d..bda5576fa 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -56,6 +56,7 @@ type Device interface { Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) } +// +stateify savable type registeredDevice struct { dev Device opts RegisterDeviceOptions @@ -63,6 +64,8 @@ type registeredDevice struct { // RegisterDeviceOptions contains options to // VirtualFilesystem.RegisterDevice(). +// +// +stateify savable type RegisterDeviceOptions struct { // GroupName is the name shown for this device registration in // /proc/devices. If GroupName is empty, this registration will not be diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 7c83f9a5a..3da45d744 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -85,8 +85,8 @@ type epollInterest struct { ready bool epollInterestEntry - // userData is the epoll_data_t associated with this epollInterest. - // userData is protected by epoll.mu. + // userData is the struct epoll_event::data associated with this + // epollInterest. userData is protected by epoll.mu. userData [2]int32 } @@ -157,7 +157,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) ( // AddInterest implements the semantics of EPOLL_CTL_ADD. // // Preconditions: A reference must be held on file. -func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error { +func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { // Check for cyclic polling if necessary. subep, _ := file.impl.(*EpollInstance) if subep != nil { @@ -183,12 +183,12 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint } // Register interest in file. - mask |= linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP epi := &epollInterest{ epoll: ep, key: key, mask: mask, - userData: userData, + userData: event.Data, } ep.interest[key] = epi wmask := waiter.EventMaskFromLinux(mask) @@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint // Add epi to file.epolls so that it is removed when the last // FileDescription reference is dropped. file.epollMu.Lock() + if file.epolls == nil { + file.epolls = make(map[*epollInterest]struct{}) + } file.epolls[epi] = struct{}{} file.epollMu.Unlock() @@ -236,7 +239,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi // ModifyInterest implements the semantics of EPOLL_CTL_MOD. // // Preconditions: A reference must be held on file. -func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error { +func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { ep.interestMu.Lock() defer ep.interestMu.Unlock() @@ -250,13 +253,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u } // Update epi for the next call to ep.ReadEvents(). + mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP ep.mu.Lock() epi.mask = mask - epi.userData = userData + epi.userData = event.Data ep.mu.Unlock() // Re-register with the new mask. - mask |= linux.EPOLLERR | linux.EPOLLRDHUP file.EventUnregister(&epi.waiter) wmask := waiter.EventMaskFromLinux(mask) file.EventRegister(&epi.waiter, wmask) @@ -363,8 +366,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { // Report ievents. events[i] = linux.EpollEvent{ Events: ievents.ToLinux(), - Fd: epi.userData[0], - Data: epi.userData[1], + Data: epi.userData, } i++ if i == len(events) { diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index badacb55e..9a1ad630c 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" @@ -393,7 +394,25 @@ type FileDescriptionImpl interface { // Removexattr removes the given extended attribute from the file. Removexattr(ctx context.Context, name string) error - // TODO: file locking + // LockBSD tries to acquire a BSD-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): BSD-style file locking + LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error + + // LockBSD releases a BSD-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): BSD-style file locking + UnlockBSD(ctx context.Context, uid lock.UniqueID) error + + // LockPOSIX tries to acquire a POSIX-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): POSIX-style file locking + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + + // UnlockPOSIX releases a POSIX-style advisory file lock. + // + // TODO(gvisor.dev/issue/1480): POSIX-style file locking + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error } // Dirent holds the information contained in struct linux_dirent64. @@ -416,11 +435,11 @@ type Dirent struct { // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. type IterDirentsCallback interface { - // Handle handles the given iterated Dirent. It returns true if iteration - // should continue, and false if FileDescriptionImpl.IterDirents should - // terminate now and restart with the same Dirent the next time it is - // called. - Handle(dirent Dirent) bool + // Handle handles the given iterated Dirent. If Handle returns a non-nil + // error, FileDescriptionImpl.IterDirents must stop iteration and return + // the error; the next call to FileDescriptionImpl.IterDirents should + // restart with the same Dirent. + Handle(dirent Dirent) error } // OnClose is called when a file descriptor representing the FileDescription is diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index a4900c170..45191d1c3 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -32,8 +33,8 @@ import ( // implementations to adapt: // - Have a local fileDescription struct (containing FileDescription) which // embeds FileDescriptionDefaultImpl and overrides the default methods -// which are common to all fd implementations for that for that filesystem -// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. +// which are common to all fd implementations for that filesystem like +// StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. // - This should be embedded in all file description implementations as the // first field by value. // - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. @@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) return syserror.ENOTSUP } +// LockBSD implements FileDescriptionImpl.LockBSD. +func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { + return syserror.EBADF +} + +// UnlockBSD implements FileDescriptionImpl.UnlockBSD. +func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { + return syserror.EBADF +} + +// LockPOSIX implements FileDescriptionImpl.LockPOSIX. +func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { + return syserror.EBADF +} + +// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. +func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { + return syserror.EBADF +} + // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 8fa26418e..3a75d4d62 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) defer fd.DecRef() @@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) { func TestWritable(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) defer fd.DecRef() diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a06a6caf3..556976d0b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -29,6 +29,8 @@ import ( // Filesystem methods require that a reference is held. // // Filesystem is analogous to Linux's struct super_block. +// +// +stateify savable type Filesystem struct { // refs is the reference count. refs is accessed using atomic memory // operations. diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c58b70728..bb9cada81 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -44,6 +44,7 @@ type GetFilesystemOptions struct { InternalData interface{} } +// +stateify savable type registeredFilesystemType struct { fsType FilesystemType opts RegisterFilesystemTypeOptions diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD new file mode 100644 index 000000000..d9ab063b7 --- /dev/null +++ b/pkg/sentry/vfs/lock/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "lock", + srcs = ["lock.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/fs/lock", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go new file mode 100644 index 000000000..724dfe743 --- /dev/null +++ b/pkg/sentry/vfs/lock/lock.go @@ -0,0 +1,72 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. +// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package lock + +import ( + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. +func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. +func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { + fl.posix.UnlockRegion(uid, rng) +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index d39528051..31a4e5480 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -38,6 +38,8 @@ import ( // // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) +// +// +stateify savable type Mount struct { // vfs, fs, and root are immutable. References are held on fs and root. // @@ -85,6 +87,8 @@ type Mount struct { // MountNamespace methods require that a reference is held. // // MountNamespace is analogous to Linux's struct mnt_namespace. +// +// +stateify savable type MountNamespace struct { // root is the MountNamespace's root mount. root is immutable. root *Mount @@ -114,6 +118,7 @@ type MountNamespace struct { func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { + ctx.Warningf("Unknown filesystem: %s", fsTypeName) return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) @@ -134,6 +139,23 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return mntns, nil } +// NewDisconnectedMount returns a Mount representing fs with the given root +// (which may be nil). The new Mount is not associated with any MountNamespace +// and is not connected to any other Mounts. References are taken on fs and +// root. +func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) { + fs.IncRef() + if root != nil { + root.IncRef() + } + return &Mount{ + vfs: vfs, + fs: fs, + root: root, + refs: 1, + }, nil +} + // MountAt creates and mounts a Filesystem configured by the given arguments. func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { rft := vfs.getFilesystemType(fsTypeName) @@ -231,9 +253,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti return syserror.EINVAL } vfs.mountMu.Lock() - if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { - vfs.mountMu.Unlock() - return syserror.EINVAL + if mntns := MountNamespaceFromContext(ctx); mntns != nil { + defer mntns.DecRef() + if mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } } // TODO(jamieliu): Linux special-cases umount of the caller's root, which @@ -423,7 +448,8 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { +func (mntns *MountNamespace) DecRef() { + vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bd90d36c4..bc7581698 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,6 +26,7 @@ import ( "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) @@ -64,6 +65,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) { // (provided mutation is sufficiently uncommon). // // mountTable.Init() must be called on new mountTables before use. +// +// +stateify savable type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion @@ -75,8 +78,8 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) - seq sync.SeqCount - seed uint32 // for hashing keys + seq sync.SeqCount `state:"nosave"` + seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in @@ -89,7 +92,7 @@ type mountTable struct { // length and cap in separate uint32s) for ~free. size uint64 - slots unsafe.Pointer // []mountSlot; never nil after Init + slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } type mountSlot struct { @@ -158,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer { // Lookup may be called even if there are concurrent mutators of mt. func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} - hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) loop: for { @@ -359,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr //go:linkname rand32 runtime.fastrand func rand32() uint32 - -// This is copy/pasted from runtime.noescape(), and is needed because arguments -// apparently escape from all functions defined by linkname. -// -//go:nosplit -func noescape(p unsafe.Pointer) unsafe.Pointer { - x := uintptr(p) - return unsafe.Pointer(x ^ 0) -} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index b7774bf28..6af7fdac1 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -61,7 +61,7 @@ type MountOptions struct { type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // - // FilesystemImpls is reponsible for implementing the following flags: + // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and @@ -72,6 +72,11 @@ type OpenOptions struct { // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the // created file. Mode linux.FileMode + + // FileExec is set when the file is being opened to be executed. + // VirtualFilesystem.OpenAt() checks that the caller has execute permissions + // on the file, and that the file is a regular file. + FileExec bool } // ReadOptions contains options to FileDescription.PRead(), diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f664581f4..8e250998a 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. -func AccessTypesForOpenFlags(flags uint32) AccessTypes { - switch flags & linux.O_ACCMODE { +func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes { + ats := AccessTypes(0) + if opts.FileExec { + ats |= MayExec + } + + switch opts.Flags & linux.O_ACCMODE { case linux.O_RDONLY: - if flags&linux.O_TRUNC != 0 { - return MayRead | MayWrite + if opts.Flags&linux.O_TRUNC != 0 { + return ats | MayRead | MayWrite } - return MayRead + return ats | MayRead case linux.O_WRONLY: - return MayWrite + return ats | MayWrite default: - return MayRead | MayWrite + return ats | MayRead | MayWrite } } diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8a0b382f6..eb4ebb511 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() { rp.pit = next } else { // at end of path segment, continue with next one rp.curPart-- - rp.pit = rp.parts[rp.curPart-1] + rp.pit = rp.parts[rp.curPart] } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 908c69f91..bde81e1ef 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -46,11 +46,13 @@ import ( // // There is no analogue to the VirtualFilesystem type in Linux, as the // equivalent state in Linux is global. +// +// +stateify savable type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. - mountMu sync.Mutex + mountMu sync.Mutex `state:"nosave"` // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly @@ -89,56 +91,59 @@ type VirtualFilesystem struct { // devices contains all registered Devices. devices is protected by // devicesMu. - devicesMu sync.RWMutex + devicesMu sync.RWMutex `state:"nosave"` devices map[devTuple]*registeredDevice // anonBlockDevMinor contains all allocated anonymous block device minor // numbers. anonBlockDevMinorNext is a lower bound for the smallest // unallocated anonymous block device number. anonBlockDevMinorNext and // anonBlockDevMinor are protected by anonBlockDevMinorMu. - anonBlockDevMinorMu sync.Mutex + anonBlockDevMinorMu sync.Mutex `state:"nosave"` anonBlockDevMinorNext uint32 anonBlockDevMinor map[uint32]struct{} // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. - fsTypesMu sync.RWMutex + fsTypesMu sync.RWMutex `state:"nosave"` fsTypes map[string]*registeredFilesystemType // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. - filesystemsMu sync.Mutex + filesystemsMu sync.Mutex `state:"nosave"` filesystems map[*Filesystem]struct{} } -// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. -func New() *VirtualFilesystem { - vfs := &VirtualFilesystem{ - mountpoints: make(map[*Dentry]map[*Mount]struct{}), - devices: make(map[devTuple]*registeredDevice), - anonBlockDevMinorNext: 1, - anonBlockDevMinor: make(map[uint32]struct{}), - fsTypes: make(map[string]*registeredFilesystemType), - filesystems: make(map[*Filesystem]struct{}), - } +// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. +func (vfs *VirtualFilesystem) Init() error { + vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) + vfs.devices = make(map[devTuple]*registeredDevice) + vfs.anonBlockDevMinorNext = 1 + vfs.anonBlockDevMinor = make(map[uint32]struct{}) + vfs.fsTypes = make(map[string]*registeredFilesystemType) + vfs.filesystems = make(map[*Filesystem]struct{}) vfs.mounts.Init() // Construct vfs.anonMount. anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() if err != nil { - panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err)) + // This shouldn't be possible since anonBlockDevMinorNext was + // initialized to 1 above (no device numbers have been allocated yet). + panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) } anonfs := anonFilesystem{ devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonfs) - vfs.anonMount = &Mount{ - vfs: vfs, - fs: &anonfs.vfsfs, - refs: 1, + defer anonfs.vfsfs.DecRef() + anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) + if err != nil { + // We should not be passing any MountOptions that would cause + // construction of this mount to fail. + panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err)) } + vfs.anonMount = anonMount - return vfs + return nil } // PathOperation specifies the path operated on by a VFS method. @@ -379,6 +384,22 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(rp) + + // TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call + // to FileDescription.Stat(). + if opts.FileExec { + // Only a regular file can be executed. + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) + if err != nil { + fd.DecRef() + return nil, err + } + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { + fd.DecRef() + return nil, syserror.EACCES + } + } + return fd, nil } if !rp.handleError(err) { @@ -724,6 +745,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { // VirtualDentry methods require that a reference is held on the VirtualDentry. // // VirtualDentry is analogous to Linux's struct path. +// +// +stateify savable type VirtualDentry struct { mount *Mount dentry *Dentry diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index bfb2fac26..f7d6009a0 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -221,7 +221,7 @@ func (w *Watchdog) waitForStart() { return } var buf bytes.Buffer - buf.WriteString("Watchdog.Start() not called within %s:\n") + buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) w.doAction(w.StartupTimeoutAction, false, &buf) } @@ -325,7 +325,7 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo func (w *Watchdog) reportStuckWatchdog() { var buf bytes.Buffer - buf.WriteString("Watchdog goroutine is stuck:\n") + buf.WriteString("Watchdog goroutine is stuck:") w.doAction(w.TaskTimeoutAction, false, &buf) } @@ -359,7 +359,7 @@ func (w *Watchdog) doAction(action Action, skipStack bool, msg *bytes.Buffer) { case <-metricsEmitted: case <-time.After(1 * time.Second): } - panic(fmt.Sprintf("Stack for running G's are skipped while panicking.\n%s", msg.String())) + panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String())) default: panic(fmt.Sprintf("Unknown watchdog action %v", action)) |