diff options
Diffstat (limited to 'pkg/sentry')
572 files changed, 30555 insertions, 9511 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 901e0f320..4af4d6e84 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -22,6 +22,7 @@ go_library( "signal_info.go", "signal_stack.go", "stack.go", + "stack_unsafe.go", "syscalls_amd64.go", "syscalls_arm64.go", ], @@ -33,11 +34,12 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/limits", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index a903d031c..d75d665ae 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -72,12 +73,12 @@ type Context interface { // with return values of varying sizes (for example ARCH_GETFS). This // is a simple utility function to convert to the native size in these // cases, and then we can CopyOut. - Native(val uintptr) interface{} + Native(val uintptr) marshal.Marshallable // Value converts a native type back to a generic value. // Once a value has been converted to native via the above call -- it // can be converted back here. - Value(val interface{}) uintptr + Value(val marshal.Marshallable) uintptr // Width returns the number of bytes for a native value. Width() uint @@ -205,7 +206,7 @@ type Context interface { // equivalent of arch_ptrace(): // PtracePeekUser implements ptrace(PTRACE_PEEKUSR). - PtracePeekUser(addr uintptr) (interface{}, error) + PtracePeekUser(addr uintptr) (marshal.Marshallable, error) // PtracePokeUser implements ptrace(PTRACE_POKEUSR). PtracePokeUser(addr, data uintptr) error diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index 343f81f59..fd73751e7 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -17,7 +17,6 @@ package arch import ( - "encoding/binary" "fmt" "io" @@ -29,7 +28,14 @@ import ( ) // Registers represents the CPU registers for this architecture. -type Registers = linux.PtraceRegs +// +// +stateify savable +type Registers struct { + linux.PtraceRegs + + // TPIDR_EL0 is the EL0 Read/Write Software Thread ID Register. + TPIDR_EL0 uint64 +} const ( // SyscallWidth is the width of insturctions. @@ -49,9 +55,14 @@ const ARMTrapFlag = uint64(1) << 21 type aarch64FPState []byte // initAarch64FPState sets up initial state. +// +// Related code in Linux kernel: fpsimd_flush_thread(). +// FPCR = FPCR_RM_RN (0x0 << 22). +// +// Currently, aarch64FPState is only a space of 0x210 length for fpstate. +// The fp head is useless in sentry/ptrace/kvm. +// func initAarch64FPState(data aarch64FPState) { - binary.LittleEndian.PutUint32(data, fpsimdMagic) - binary.LittleEndian.PutUint32(data[4:], fpsimdContextSize) } func newAarch64FPStateSlice() []byte { @@ -90,6 +101,8 @@ func NewFloatingPointData() *FloatingPointData { // State contains the common architecture bits for aarch64 (the build tag of this // file ensures it's only built on aarch64). +// +// +stateify savable type State struct { // The system registers. Regs Registers @@ -97,9 +110,6 @@ type State struct { // Our floating point state. aarch64FPState `state:"wait"` - // TLS pointer - TPValue uint64 - // FeatureSet is a pointer to the currently active feature set. FeatureSet *cpuid.FeatureSet @@ -144,6 +154,7 @@ func (s State) Proto() *rpb.Registers { Sp: s.Regs.Sp, Pc: s.Regs.Pc, Pstate: s.Regs.Pstate, + Tls: s.Regs.TPIDR_EL0, } return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}} } @@ -153,7 +164,6 @@ func (s *State) Fork() State { return State{ Regs: s.Regs, aarch64FPState: s.aarch64FPState.fork(), - TPValue: s.TPValue, FeatureSet: s.FeatureSet, OrigR0: s.OrigR0, } @@ -223,6 +233,7 @@ func (s *State) RegisterMap() (map[string]uintptr, error) { "Sp": uintptr(s.Regs.Sp), "Pc": uintptr(s.Regs.Pc), "Pstate": uintptr(s.Regs.Pstate), + "Tls": uintptr(s.Regs.TPIDR_EL0), }, nil } @@ -237,18 +248,18 @@ func (s *State) ptraceGetRegs() Registers { return s.Regs } -var registersSize = (*Registers)(nil).SizeBytes() +var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers - buf := make([]byte, registersSize) + buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } regs.UnmarshalUnsafe(buf) s.Regs = regs - return registersSize, nil + return ptraceRegistersSize, nil } // PtraceGetFPRegs implements Context.PtraceGetFPRegs. @@ -274,7 +285,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -287,7 +298,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 3b3a0a272..c7d3a206d 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -23,6 +23,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -179,14 +181,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { } // Native returns the native type for the given val. -func (c *context64) Native(val uintptr) interface{} { - v := uint64(val) +func (c *context64) Native(val uintptr) marshal.Marshallable { + v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type. -func (c *context64) Value(val interface{}) uintptr { - return uintptr(*val.(*uint64)) +func (c *context64) Value(val marshal.Marshallable) uintptr { + return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. @@ -293,14 +295,14 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { const userStructSize = 928 // PtracePeekUser implements Context.PtracePeekUser. -func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { +func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { if addr&7 != 0 || addr >= userStructSize { return nil, syscall.EIO } // PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and // u_debugreg, returning 0 or silently no-oping for other fields // respectively. - if addr < uintptr(registersSize) { + if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) @@ -315,7 +317,7 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error { if addr&7 != 0 || addr >= userStructSize { return syscall.EIO } - if addr < uintptr(registersSize) { + if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index ada7ac7b8..680d23a9f 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -22,6 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/usermem" ) @@ -73,6 +75,8 @@ const ( ) // context64 represents an ARM64 context. +// +// +stateify savable type context64 struct { State sigFPState []aarch64FPState // fpstate to be restored on sigreturn. @@ -142,7 +146,7 @@ func (c *context64) SetStack(value uintptr) { // TLS returns the current TLS pointer. func (c *context64) TLS() uintptr { - return uintptr(c.TPValue) + return uintptr(c.Regs.TPIDR_EL0) } // SetTLS sets the current TLS pointer. Returns false if value is invalid. @@ -151,7 +155,7 @@ func (c *context64) SetTLS(value uintptr) bool { return false } - c.TPValue = uint64(value) + c.Regs.TPIDR_EL0 = uint64(value) return true } @@ -161,14 +165,14 @@ func (c *context64) SetOldRSeqInterruptedIP(value uintptr) { } // Native returns the native type for the given val. -func (c *context64) Native(val uintptr) interface{} { - v := uint64(val) +func (c *context64) Native(val uintptr) marshal.Marshallable { + v := primitive.Uint64(val) return &v } // Value returns the generic val for the given native type. -func (c *context64) Value(val interface{}) uintptr { - return uintptr(*val.(*uint64)) +func (c *context64) Value(val marshal.Marshallable) uintptr { + return uintptr(*val.(*primitive.Uint64)) } // Width returns the byte width of this architecture. @@ -272,7 +276,7 @@ func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr { } // PtracePeekUser implements Context.PtracePeekUser. -func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) { +func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) { // TODO(gvisor.dev/issue/1239): Full ptrace supporting for Arm64. return c.Native(0), nil } diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index dc458b37f..b9405b320 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -31,7 +31,11 @@ import ( ) // Registers represents the CPU registers for this architecture. -type Registers = linux.PtraceRegs +// +// +stateify savable +type Registers struct { + linux.PtraceRegs +} // System-related constants for x86. const ( @@ -311,12 +315,12 @@ func (s *State) ptraceGetRegs() Registers { return regs } -var registersSize = (*Registers)(nil).SizeBytes() +var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers - buf := make([]byte, registersSize) + buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } @@ -374,7 +378,7 @@ func (s *State) PtraceSetRegs(src io.Reader) (int, error) { } regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) s.Regs = regs - return registersSize, nil + return ptraceRegistersSize, nil } // isUserSegmentSelector returns true if the given segment selector specifies a @@ -543,7 +547,7 @@ const ( func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceGetRegs(dst) @@ -563,7 +567,7 @@ func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: - if maxlen < registersSize { + if maxlen < ptraceRegistersSize { return 0, syserror.EFAULT } return s.PtraceSetRegs(src) diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto index 60c027aab..2727ba08a 100644 --- a/pkg/sentry/arch/registers.proto +++ b/pkg/sentry/arch/registers.proto @@ -83,6 +83,7 @@ message ARM64Registers { uint64 sp = 32; uint64 pc = 33; uint64 pstate = 34; + uint64 tls = 35; } message Registers { oneof arch { diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go index 32173aa20..d3e2324a8 100644 --- a/pkg/sentry/arch/signal_act.go +++ b/pkg/sentry/arch/signal_act.go @@ -14,7 +14,7 @@ package arch -import "gvisor.dev/gvisor/tools/go_marshal/marshal" +import "gvisor.dev/gvisor/pkg/marshal" // Special values for SignalAct.Handler. const ( diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 6fb756f0e..72e07a988 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -17,17 +17,19 @@ package arch import ( - "encoding/binary" "math" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). +// +// +marshal type SignalContext64 struct { R8 uint64 R9 uint64 @@ -68,6 +70,8 @@ const ( ) // UContext64 is equivalent to ucontext_t on 64-bit x86. +// +// +marshal type UContext64 struct { Flags uint64 Link uint64 @@ -172,12 +176,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // "... the value (%rsp+8) is always a multiple of 16 (...) when // control is transferred to the function entry point." - AMD64 ABI - ucSize := binary.Size(uc) - if ucSize < 0 { - // This can only happen if we've screwed up the definition of - // UContext64. - panic("can't get size of UContext64") - } + ucSize := uc.SizeBytes() // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. frameSize := int(st.Arch.Width()) + ucSize + 128 frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8 @@ -195,18 +194,18 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt info.FixSignalCodeForUser() // Set up the stack frame. - infoAddr, err := st.Push(info) - if err != nil { + if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } - ucAddr, err := st.Push(uc) - if err != nil { + infoAddr := st.Bottom + if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } + ucAddr := st.Bottom if act.HasRestorer() { // Push the restorer return address. // Note that this doesn't need to be popped. - if _, err := st.Push(usermem.Addr(act.Restorer)); err != nil { + if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil { return err } } else { @@ -240,11 +239,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { // Copy out the stack frame. var uc UContext64 - if _, err := st.Pop(&uc); err != nil { + if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } var info SignalInfo - if _, err := st.Pop(&info); err != nil { + if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 642c79dda..7fde5d34e 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package arch import ( - "encoding/binary" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +26,8 @@ import ( // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). +// +// +marshal type SignalContext64 struct { FaultAddr uint64 Regs [31]uint64 @@ -36,6 +39,7 @@ type SignalContext64 struct { Reserved [3568]uint8 } +// +marshal type aarch64Ctx struct { Magic uint32 Size uint32 @@ -43,6 +47,8 @@ type aarch64Ctx struct { // FpsimdContext is equivalent to struct fpsimd_context on arm64 // (arch/arm64/include/uapi/asm/sigcontext.h). +// +// +marshal type FpsimdContext struct { Head aarch64Ctx Fpsr uint32 @@ -51,13 +57,15 @@ type FpsimdContext struct { } // UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h). +// +// +marshal type UContext64 struct { Flags uint64 Link uint64 Stack SignalStack Sigset linux.SignalSet // glibc uses a 1024-bit sigset_t - _pad [(1024 - 64) / 8]byte + _pad [120]byte // (1024 - 64) / 8 = 120 // sigcontext must be aligned to 16-byte _pad2 [8]byte // last for future expansion @@ -94,11 +102,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt }, Sigset: sigset, } - - ucSize := binary.Size(uc) - if ucSize < 0 { - panic("can't get size of UContext64") - } + ucSize := uc.SizeBytes() // frameSize = ucSize + sizeof(siginfo). // sizeof(siginfo) == 128. @@ -119,14 +123,14 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt info.FixSignalCodeForUser() // Set up the stack frame. - infoAddr, err := st.Push(info) - if err != nil { + if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } - ucAddr, err := st.Push(uc) - if err != nil { + infoAddr := st.Bottom + if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } + ucAddr := st.Bottom // Set up registers. c.Regs.Sp = uint64(st.Bottom) @@ -147,11 +151,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { // Copy out the stack frame. var uc UContext64 - if _, err := st.Pop(&uc); err != nil { + if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } var info SignalInfo - if _, err := st.Pop(&info); err != nil { + if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, SignalStack{}, err } diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index 0fa738a1d..a1eae98f9 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -17,8 +17,8 @@ package arch import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/tools/go_marshal/marshal" ) const ( diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 1108fa0bd..5f06c751d 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -15,14 +15,16 @@ package arch import ( - "encoding/binary" - "fmt" - "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) -// Stack is a simple wrapper around a usermem.IO and an address. +// Stack is a simple wrapper around a usermem.IO and an address. Stack +// implements marshal.CopyContext, and marshallable values can be pushed or +// popped from the stack through the marshal.Marshallable interface. +// +// Stack is not thread-safe. type Stack struct { // Our arch info. // We use this for automatic Native conversion of usermem.Addrs during @@ -34,105 +36,60 @@ type Stack struct { // Our current stack bottom. Bottom usermem.Addr -} -// Push pushes the given values on to the stack. -// -// (This method supports Addrs and treats them as native types.) -func (s *Stack) Push(vals ...interface{}) (usermem.Addr, error) { - for _, v := range vals { - - // We convert some types to well-known serializable quanities. - var norm interface{} - - // For array types, we will automatically add an appropriate - // terminal value. This is done simply to make the interface - // easier to use. - var term interface{} - - switch v.(type) { - case string: - norm = []byte(v.(string)) - term = byte(0) - case []int8, []uint8: - norm = v - term = byte(0) - case []int16, []uint16: - norm = v - term = uint16(0) - case []int32, []uint32: - norm = v - term = uint32(0) - case []int64, []uint64: - norm = v - term = uint64(0) - case []usermem.Addr: - // Special case: simply push recursively. - _, err := s.Push(s.Arch.Native(uintptr(0))) - if err != nil { - return 0, err - } - varr := v.([]usermem.Addr) - for i := len(varr) - 1; i >= 0; i-- { - _, err := s.Push(varr[i]) - if err != nil { - return 0, err - } - } - continue - case usermem.Addr: - norm = s.Arch.Native(uintptr(v.(usermem.Addr))) - default: - norm = v - } + // Scratch buffer used for marshalling to avoid having to repeatedly + // allocate scratch memory. + scratchBuf []byte +} - if term != nil { - _, err := s.Push(term) - if err != nil { - return 0, err - } - } +// scratchBufLen is the default length of Stack.scratchBuf. The +// largest structs the stack regularly serializes are arch.SignalInfo +// and arch.UContext64. We'll set the default size as the larger of +// the two, arch.UContext64. +var scratchBufLen = (*UContext64)(nil).SizeBytes() - c := binary.Size(norm) - if c < 0 { - return 0, fmt.Errorf("bad binary.Size for %T", v) - } - n, err := usermem.CopyObjectOut(context.Background(), s.IO, s.Bottom-usermem.Addr(c), norm, usermem.IOOpts{}) - if err != nil || c != n { - return 0, err - } +// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. +func (s *Stack) CopyScratchBuffer(size int) []byte { + if len(s.scratchBuf) < size { + s.scratchBuf = make([]byte, size) + } + return s.scratchBuf[:size] +} +// StackBottomMagic is the special address callers must past to all stack +// marshalling operations to cause the src/dst address to be computed based on +// the current end of the stack. +const StackBottomMagic = ^usermem.Addr(0) // usermem.Addr(-1) + +// CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes +// computes an appropriate address based on the current end of the +// stack. Callers use the sentinel address StackBottomMagic to marshal methods +// to indicate this. +func (s *Stack) CopyOutBytes(sentinel usermem.Addr, b []byte) (int, error) { + if sentinel != StackBottomMagic { + panic("Attempted to copy out to stack with absolute address") + } + c := len(b) + n, err := s.IO.CopyOut(context.Background(), s.Bottom-usermem.Addr(c), b, usermem.IOOpts{}) + if err == nil && n == c { s.Bottom -= usermem.Addr(n) } - - return s.Bottom, nil + return n, err } -// Pop pops the given values off the stack. -// -// (This method supports Addrs and treats them as native types.) -func (s *Stack) Pop(vals ...interface{}) (usermem.Addr, error) { - for _, v := range vals { - - vaddr, isVaddr := v.(*usermem.Addr) - - var n int - var err error - if isVaddr { - value := s.Arch.Native(uintptr(0)) - n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, value, usermem.IOOpts{}) - *vaddr = usermem.Addr(s.Arch.Value(value)) - } else { - n, err = usermem.CopyObjectIn(context.Background(), s.IO, s.Bottom, v, usermem.IOOpts{}) - } - if err != nil { - return 0, err - } - +// CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes +// an appropriate address based on the current end of the stack. Callers must +// use the sentinel address StackBottomMagic to marshal methods to indicate +// this. +func (s *Stack) CopyInBytes(sentinel usermem.Addr, b []byte) (int, error) { + if sentinel != StackBottomMagic { + panic("Attempted to copy in from stack with absolute address") + } + n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{}) + if err == nil { s.Bottom += usermem.Addr(n) } - - return s.Bottom, nil + return n, err } // Align aligns the stack to the given offset. @@ -142,6 +99,22 @@ func (s *Stack) Align(offset int) { } } +// PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null +// byte at the end. On error, the contents of the stack and the bottom cursor +// are undefined. +func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) { + // Note: Stack grows up, so write the terminal null byte first. + nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs) + if err != nil { + return 0, err + } + return n + nNull, nil +} + // StackLayout describes the location of the arguments and environment on the // stack. type StackLayout struct { @@ -177,11 +150,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) l.EnvvEnd = s.Bottom envAddrs := make([]usermem.Addr, len(env)) for i := len(env) - 1; i >= 0; i-- { - addr, err := s.Push(env[i]) - if err != nil { + if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil { return StackLayout{}, err } - envAddrs[i] = addr + envAddrs[i] = s.Bottom } l.EnvvStart = s.Bottom @@ -189,11 +161,10 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) l.ArgvEnd = s.Bottom argAddrs := make([]usermem.Addr, len(args)) for i := len(args) - 1; i >= 0; i-- { - addr, err := s.Push(args[i]) - if err != nil { + if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil { return StackLayout{}, err } - argAddrs[i] = addr + argAddrs[i] = s.Bottom } l.ArgvStart = s.Bottom @@ -222,26 +193,26 @@ func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) auxv = append(auxv, usermem.Addr(a.Key), a.Value) } auxv = append(auxv, usermem.Addr(0)) - _, err := s.Push(auxv) + _, err := s.pushAddrSliceAndTerminator(auxv) if err != nil { return StackLayout{}, err } // Push environment. - _, err = s.Push(envAddrs) + _, err = s.pushAddrSliceAndTerminator(envAddrs) if err != nil { return StackLayout{}, err } // Push args. - _, err = s.Push(argAddrs) + _, err = s.pushAddrSliceAndTerminator(argAddrs) if err != nil { return StackLayout{}, err } // Push arg count. - _, err = s.Push(usermem.Addr(len(args))) - if err != nil { + lenP := s.Arch.Native(uintptr(len(args))) + if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil { return StackLayout{}, err } diff --git a/pkg/sentry/arch/stack_unsafe.go b/pkg/sentry/arch/stack_unsafe.go new file mode 100644 index 000000000..a90d297ee --- /dev/null +++ b/pkg/sentry/arch/stack_unsafe.go @@ -0,0 +1,69 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arch + +import ( + "reflect" + "runtime" + "unsafe" + + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/usermem" +) + +// pushAddrSliceAndTerminator copies a slices of addresses to the stack, and +// also pushes an extra null address element at the end of the slice. +// +// Internally, we unsafely transmute the slice type from the arch-dependent +// []usermem.Addr type, to a slice of fixed-sized ints so that we can pass it to +// go-marshal. +// +// On error, the contents of the stack and the bottom cursor are undefined. +func (s *Stack) pushAddrSliceAndTerminator(src []usermem.Addr) (int, error) { + // Note: Stack grows upwards, so push the terminator first. + srcHdr := (*reflect.SliceHeader)(unsafe.Pointer(&src)) + switch s.Arch.Width() { + case 8: + nNull, err := primitive.CopyUint64Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + var dst []uint64 + dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) + dstHdr.Data = srcHdr.Data + dstHdr.Len = srcHdr.Len + dstHdr.Cap = srcHdr.Cap + n, err := primitive.CopyUint64SliceOut(s, StackBottomMagic, dst) + // Ensures src doesn't get GCed until we're done using it through dst. + runtime.KeepAlive(src) + return n + nNull, err + case 4: + nNull, err := primitive.CopyUint32Out(s, StackBottomMagic, 0) + if err != nil { + return 0, err + } + var dst []uint32 + dstHdr := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) + dstHdr.Data = srcHdr.Data + dstHdr.Len = srcHdr.Len + dstHdr.Cap = srcHdr.Cap + n, err := primitive.CopyUint32SliceOut(s, StackBottomMagic, dst) + // Ensure src doesn't get GCed until we're done using it through dst. + runtime.KeepAlive(src) + return n + nNull, err + default: + panic("Unsupported arch width") + } +} diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go index 8e5658c7a..dfd195a23 100644 --- a/pkg/sentry/contexttest/contexttest.go +++ b/pkg/sentry/contexttest/contexttest.go @@ -144,27 +144,7 @@ func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { // RootContext returns a Context that may be used in tests that need root // credentials. Uses ptrace as the platform.Platform. func RootContext(tb testing.TB) context.Context { - return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) -} - -// WithCreds returns a copy of ctx carrying creds. -func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { - return &authContext{ctx, creds} -} - -type authContext struct { - context.Context - creds *auth.Credentials -} - -// Value implements context.Context. -func (ac *authContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return ac.creds - default: - return ac.Context.Value(key) - } + return auth.ContextWithCredentials(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) } // WithLimitSet returns a copy of ctx carrying l. diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 0c9a62f0d..deaf5fa23 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -16,15 +16,12 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/context", "//pkg/fd", - "//pkg/fspath", "//pkg/log", "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/fs/user", - "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -36,10 +33,8 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/logging.go b/pkg/sentry/control/logging.go index 811f24324..8a500a515 100644 --- a/pkg/sentry/control/logging.go +++ b/pkg/sentry/control/logging.go @@ -70,8 +70,8 @@ type LoggingArgs struct { type Logging struct{} // Change will change the log level and strace arguments. Although -// this functions signature requires an error it never acctually -// return san error. It's required by the URPC interface. +// this functions signature requires an error it never actually +// returns an error. It's required by the URPC interface. // Additionally, it may look odd that this is the only method // attached to an empty struct but this is also part of how // URPC dispatches. diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index 663e51989..2bf3c45e1 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -49,6 +49,9 @@ type ProfileOpts struct { // - dump out the stack trace of current go routines. // sentryctl -pid <pid> pprof-goroutine type Profile struct { + // Kernel is the kernel under profile. It's immutable. + Kernel *kernel.Kernel + // mu protects the fields below. mu sync.Mutex @@ -57,9 +60,6 @@ type Profile struct { // traceFile is the current execution trace output file. traceFile *fd.FD - - // Kernel is the kernel under profile. - Kernel *kernel.Kernel } // StartCPUProfile is an RPC stub which starts recording the CPU profile in a diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 8767430b7..1d88db12f 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -23,15 +23,12 @@ import ( "text/tabwriter" "time" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" - "gvisor.dev/gvisor/pkg/sentry/fsbridge" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -39,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/urpc" ) @@ -107,6 +103,9 @@ type ExecArgs struct { // String prints the arguments as a string. func (args ExecArgs) String() string { + if len(args.Argv) == 0 { + return args.Filename + } a := make([]string, len(args.Argv)) copy(a, args.Argv) if args.Filename != "" { @@ -140,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() - defer fdTable.DecRef() creds := auth.NewUserCredentials( args.KUID, @@ -178,60 +176,44 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI initArgs.MountNamespaceVFS2.IncRef() } ctx := initArgs.NewContext(proc.Kernel) + defer fdTable.DecRef(ctx) + + if kernel.VFS2Enabled { + // Get the full path to the filename from the PATH env variable. + if initArgs.MountNamespaceVFS2 == nil { + // Set initArgs so that 'ctx' returns the namespace. + // + // Add a reference to the namespace, which is transferred to the new process. + initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() + initArgs.MountNamespaceVFS2.IncRef() + } + } else { + if initArgs.MountNamespace == nil { + // Set initArgs so that 'ctx' returns the namespace. + initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() - if initArgs.Filename == "" { - if kernel.VFS2Enabled { - // Get the full path to the filename from the PATH env variable. - if initArgs.MountNamespaceVFS2 == nil { - // Set initArgs so that 'ctx' returns the namespace. - // - // MountNamespaceVFS2 adds a reference to the namespace, which is - // transferred to the new process. - initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() - } - file, err := getExecutableFD(ctx, creds, proc.Kernel.VFS(), initArgs.MountNamespaceVFS2, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) - if err != nil { - return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in environment %v: %v", initArgs.Argv[0], initArgs.Envv, err) - } - initArgs.File = fsbridge.NewVFSFile(file) - } else { - if initArgs.MountNamespace == nil { - // Set initArgs so that 'ctx' returns the namespace. - initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() - - // initArgs must hold a reference on MountNamespace, which will - // be donated to the new process in CreateProcess. - initArgs.MountNamespace.IncRef() - } - f, err := user.ResolveExecutablePath(ctx, creds, initArgs.MountNamespace, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) - if err != nil { - return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], initArgs.Envv, err) - } - initArgs.Filename = f + // initArgs must hold a reference on MountNamespace, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespace.IncRef() } } + resolved, err := user.ResolveExecutablePath(ctx, &initArgs) + if err != nil { + return nil, 0, nil, nil, err + } + initArgs.Filename = resolved - fds := make([]int, len(args.FilePayload.Files)) - for i, file := range args.FilePayload.Files { - if kernel.VFS2Enabled { - // Need to dup to remove ownership from os.File. - dup, err := unix.Dup(int(file.Fd())) - if err != nil { - return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) - } - fds[i] = dup - } else { - // VFS1 dups the file on import. - fds[i] = int(file.Fd()) - } + fds, err := fd.NewFromFiles(args.Files) + if err != nil { + return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) if err != nil { - if kernel.VFS2Enabled { - for _, fd := range fds { - unix.Close(fd) - } - } return nil, 0, nil, nil, err } @@ -422,31 +404,3 @@ func ttyName(tty *kernel.TTY) string { } return fmt.Sprintf("pts/%d", tty.Index) } - -// getExecutableFD resolves the given executable name and returns a -// vfs.FileDescription for the executable file. -func getExecutableFD(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, mns *vfs.MountNamespace, envv []string, wd, name string) (*vfs.FileDescription, error) { - path, err := user.ResolveExecutablePathVFS2(ctx, creds, mns, envv, wd, name) - if err != nil { - return nil, err - } - - root := vfs.RootFromContext(ctx) - defer root.DecRef() - - pop := vfs.PathOperation{ - Root: root, - Start: root, // binPath is absolute, Start can be anything. - Path: fspath.Parse(path), - FollowFinalSymlink: true, - } - opts := &vfs.OpenOptions{ - Flags: linux.O_RDONLY, - FileExec: true, - } - f, err := vfsObj.OpenAt(ctx, creds, &pop, opts) - if err == syserror.ENOENT || err == syserror.EACCES { - return nil, nil - } - return f, err -} diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go index 41feeffe3..d800f2c85 100644 --- a/pkg/sentry/control/state.go +++ b/pkg/sentry/control/state.go @@ -69,5 +69,5 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error { s.Kernel.Kill(kernel.ExitStatus{}) }, } - return saveOpts.Save(s.Kernel, s.Watchdog) + return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog) } diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 69e71e322..6ca9dc79f 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -188,6 +188,9 @@ type MultiDevice struct { // String stringifies MultiDevice. func (m *MultiDevice) String() string { + m.mu.Lock() + defer m.mu.Unlock() + buf := bytes.NewBuffer(nil) buf.WriteString("cache{") for k, v := range m.cache { @@ -253,7 +256,7 @@ func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool { } if k, exists := m.rcache[value]; exists && k != key { // Should never happen. - panic("MultiDevice's caches are inconsistent") + panic(fmt.Sprintf("MultiDevice's caches are inconsistent, current: %+v, previous: %+v", key, k)) } // Cache value at key. diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD index abe58f818..4c8604d58 100644 --- a/pkg/sentry/devices/memdev/BUILD +++ b/pkg/sentry/devices/memdev/BUILD @@ -18,9 +18,10 @@ go_library( "//pkg/rand", "//pkg/safemem", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/mm", - "//pkg/sentry/pgalloc", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index c7e197691..fece3e762 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -24,6 +24,8 @@ import ( const fullDevMinor = 7 // fullDevice implements vfs.Device for /dev/full. +// +// +stateify savable type fullDevice struct{} // Open implements vfs.Device.Open. @@ -38,14 +40,17 @@ func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // fullFD implements vfs.FileDescriptionImpl for /dev/full. +// +// +stateify savable type fullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *fullFD) Release() { +func (fd *fullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 33d060d02..ff5837747 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -25,6 +25,8 @@ import ( const nullDevMinor = 3 // nullDevice implements vfs.Device for /dev/null. +// +// +stateify savable type nullDevice struct{} // Open implements vfs.Device.Open. @@ -39,14 +41,17 @@ func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // nullFD implements vfs.FileDescriptionImpl for /dev/null. +// +// +stateify savable type nullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nullFD) Release() { +func (fd *nullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index acfa23149..ac943e3ba 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -30,6 +30,8 @@ const ( ) // randomDevice implements vfs.Device for /dev/random and /dev/urandom. +// +// +stateify savable type randomDevice struct{} // Open implements vfs.Device.Open. @@ -44,10 +46,13 @@ func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, } // randomFD implements vfs.FileDescriptionImpl for /dev/random. +// +// +stateify savable type randomFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // off is the "file offset". off is accessed using atomic memory // operations. @@ -55,7 +60,7 @@ type randomFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *randomFD) Release() { +func (fd *randomFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index 3b1372b9e..1929e41cd 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -16,9 +16,10 @@ package memdev import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -26,6 +27,8 @@ import ( const zeroDevMinor = 5 // zeroDevice implements vfs.Device for /dev/zero. +// +// +stateify savable type zeroDevice struct{} // Open implements vfs.Device.Open. @@ -40,14 +43,17 @@ func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op } // zeroFD implements vfs.FileDescriptionImpl for /dev/zero. +// +// +stateify savable type zeroFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *zeroFD) Release() { +func (fd *zeroFD) Release(context.Context) { // noop } @@ -78,11 +84,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if opts.Private || !opts.MaxPerms.Write { + // This mapping will never permit writing to the "underlying file" (in + // Linux terms, it isn't VM_SHARED), so implement it as an anonymous + // mapping, but back it with fd; this is what Linux does, and is + // actually application-visible because the resulting VMA will show up + // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than + // "/dev/zero (deleted)". + opts.Offset = 0 + opts.MappingIdentity = &fd.vfsfd + opts.MappingIdentity.IncRef() + return nil + } + tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length) if err != nil { return err } - opts.MappingIdentity = m - opts.Mappable = m - return nil + defer tmpfsFD.DecRef(ctx) + return tmpfsFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/devices/ttydev/BUILD b/pkg/sentry/devices/ttydev/BUILD new file mode 100644 index 000000000..b4b6ca38a --- /dev/null +++ b/pkg/sentry/devices/ttydev/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "ttydev", + srcs = ["ttydev.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go new file mode 100644 index 000000000..a287c65ca --- /dev/null +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -0,0 +1,53 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ttydev implements an unopenable vfs.Device for /dev/tty. +package ttydev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + // See drivers/tty/tty_io.c:tty_init(). + ttyDevMinor = 0 + consoleDevMinor = 1 +) + +// ttyDevice implements vfs.Device for /dev/tty. +// +// +stateify savable +type ttyDevice struct{} + +// Open implements vfs.Device.Open. +func (ttyDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.EIO +} + +// Register registers all devices implemented by this package in vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + return vfsObj.RegisterDevice(vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, ttyDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "tty", + }) +} + +// CreateDevtmpfsFiles creates device special files in dev representing all +// devices implemented by this package. +func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { + return dev.CreateDeviceFile(ctx, "tty", vfs.CharDevice, linux.TTYAUX_MAJOR, ttyDevMinor, 0666 /* mode */) +} diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD new file mode 100644 index 000000000..14a8bf9cd --- /dev/null +++ b/pkg/sentry/devices/tundev/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "tundev", + srcs = ["tundev.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/socket/netstack", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/tcpip/link/tun", + "//pkg/tcpip/network/arp", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go new file mode 100644 index 000000000..ff5d49fbd --- /dev/null +++ b/pkg/sentry/devices/tundev/tundev.go @@ -0,0 +1,191 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tundev implements the /dev/net/tun device. +package tundev + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + netTunDevMajor = 10 + netTunDevMinor = 200 +) + +// tunDevice implements vfs.Device for /dev/net/tun. +// +// +stateify savable +type tunDevice struct{} + +// Open implements vfs.Device.Open. +func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &tunFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// tunFD implements vfs.FileDescriptionImpl for /dev/net/tun. +// +// +stateify savable +type tunFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + device tun.Device +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + request := args[1].Uint() + data := args[2].Pointer() + + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + + switch request { + case linux.TUNSETIFF: + if !t.HasCapability(linux.CAP_NET_ADMIN) { + return 0, syserror.EPERM + } + stack, ok := t.NetworkContext().(*netstack.Stack) + if !ok { + return 0, syserror.EINVAL + } + + var req linux.IFReq + if _, err := req.CopyIn(t, data); err != nil { + return 0, err + } + flags := usermem.ByteOrder.Uint16(req.Data[:]) + created, err := fd.device.SetIff(stack.Stack, req.Name(), flags) + if err == nil && created { + // Always start with an ARP address for interfaces so they can handle ARP + // packets. + nicID := fd.device.NICID() + if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) + } + } + return 0, err + + case linux.TUNGETIFF: + var req linux.IFReq + + copy(req.IFName[:], fd.device.Name()) + + // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when + // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. + flags := fd.device.Flags() | linux.IFF_NOFILTER + usermem.ByteOrder.PutUint16(req.Data[:], flags) + + _, err := req.CopyOut(t, data) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *tunFD) Release(ctx context.Context) { + fd.device.Release(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.Read(ctx, dst, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + data, err := fd.device.Read() + if err != nil { + return 0, err + } + n, err := dst.CopyOut(ctx, data) + if n > 0 && n < len(data) { + // Not an error for partial copying. Packet truncated. + err = nil + } + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.Write(ctx, src, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + data := make([]byte, src.NumBytes()) + if _, err := src.CopyIn(ctx, data); err != nil { + return 0, err + } + return fd.device.Write(data) +} + +// Readiness implements watier.Waitable.Readiness. +func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.device.Readiness(mask) +} + +// EventRegister implements watier.Waitable.EventRegister. +func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.device.EventRegister(e, mask) +} + +// EventUnregister implements watier.Waitable.EventUnregister. +func (fd *tunFD) EventUnregister(e *waiter.Entry) { + fd.device.EventUnregister(e) +} + +// IsNetTunSupported returns whether /dev/net/tun device is supported for s. +func IsNetTunSupported(s inet.Stack) bool { + _, ok := s.(*netstack.Stack) + return ok +} + +// Register registers all devices implemented by this package in vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{}) +} + +// CreateDevtmpfsFiles creates device special files in dev representing all +// devices implemented by this package. +func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { + return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */) +} diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD index 5e41ceb4e..6b4f8b0ed 100644 --- a/pkg/sentry/fdimport/BUILD +++ b/pkg/sentry/fdimport/BUILD @@ -10,6 +10,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/context", + "//pkg/fd", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/fsimpl/host", diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index a4199f9e9..314661475 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -15,7 +15,10 @@ package fdimport import ( + "fmt" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" @@ -25,8 +28,9 @@ import ( // Import imports a slice of FDs into the given FDTable. If console is true, // sets up TTY for the first 3 FDs in the slice representing stdin, stdout, -// stderr. Upon success, Import takes ownership of all FDs. -func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +// stderr. Used FDs are either closed or released. It's safe for the caller to +// close any remaining files upon return. +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if kernel.VFS2Enabled { ttyFile, err := importVFS2(ctx, fdTable, console, fds) return nil, ttyFile, err @@ -35,7 +39,7 @@ func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []in return ttyFile, nil, err } -func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) { +func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []*fd.FD) (*host.TTYFileOperations, error) { var ttyFile *fs.File for appFD, hostFD := range fds { var appFile *fs.File @@ -44,11 +48,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD.FD(), true /* isTTY */) if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) + _ = hostFD.Close() // FD is dup'd i ImportFile. // Remember this in the TTY file, as we will // use it for the other stdio FDs. @@ -63,11 +68,12 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD.FD(), false /* isTTY */) if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) + _ = hostFD.Close() // FD is dup'd i ImportFile. } // Add the file to the FD map. @@ -82,8 +88,11 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] return ttyFile.FileOperations.(*host.TTYFileOperations), nil } -func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []*fd.FD) (*hostvfs2.TTYFileDescription, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } var ttyFile *vfs.FileDescription for appFD, hostFD := range stdioFDs { @@ -93,11 +102,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */) + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), true /* isTTY */) if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) + hostFD.Release() // FD is transfered to host FD. // Remember this in the TTY file, as we will use it for the other stdio // FDs. @@ -110,11 +120,12 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi } } else { var err error - appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */) + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD.FD(), false /* isTTY */) if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) + hostFD.Release() // FD is transfered to host FD. } if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ab1424c95..ff2fe6712 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -107,8 +107,7 @@ func copyUp(ctx context.Context, d *Dirent) error { // leave the upper filesystem filled with any number of parent directories // but the upper filesystem will never be in an inconsistent state. // -// Preconditions: -// - d.Inode.overlay is non-nil. +// Preconditions: d.Inode.overlay is non-nil. func copyUpLockedForRename(ctx context.Context, d *Dirent) error { for { // Did we race with another copy up or does there @@ -183,12 +182,12 @@ func doCopyUp(ctx context.Context, d *Dirent) error { // Returns a generic error on failure. // // Preconditions: -// - parent.Inode.overlay.upper must be non-nil. -// - next.Inode.overlay.copyMu must be locked writable. -// - next.Inode.overlay.lower must be non-nil. -// - next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, +// * parent.Inode.overlay.upper must be non-nil. +// * next.Inode.overlay.copyMu must be locked writable. +// * next.Inode.overlay.lower must be non-nil. +// * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory, // or Symlink. -// - upper filesystem must support setting file ownership and timestamps. +// * upper filesystem must support setting file ownership and timestamps. func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { // Extract the attributes of the file we wish to copy. attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) @@ -201,7 +200,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { parentUpper := parent.Inode.overlay.upper root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } // Create the file in the upper filesystem and get an Inode for it. @@ -212,7 +211,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { log.Warningf("copy up failed to create file: %v", err) return syserror.EIO } - defer childFile.DecRef() + defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode case Directory: @@ -226,7 +225,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode case Symlink: @@ -246,7 +245,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode default: @@ -352,14 +351,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in if err != nil { return err } - defer upperFile.DecRef() + defer upperFile.DecRef(ctx) // Get a handle to the lower filesystem, which we will read from. lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) if err != nil { return err } - defer lowerFile.DecRef() + defer lowerFile.DecRef(ctx) // Use a buffer pool to minimize allocations. buf := copyUpBuffers.Get().([]byte) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 91792d9fe..c7a11eec1 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to create file %q: %v", name, err) } - defer f.DecRef() + defer f.DecRef(ctx) relname, _ := f.Dirent.FullName(lowerRoot) @@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } - defer d.DecRef() + defer d.DecRef(ctx) f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 9379a4d7b..6b7b451b8 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/syserror", "//pkg/tcpip/link/tun", + "//pkg/tcpip/network/arp", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index dc7ad075a..19ffdec47 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -15,6 +15,8 @@ package dev import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -25,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -60,7 +63,7 @@ func newNetTunDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMod } // GetFile implements fs.InodeOperations.GetFile. -func (iops *netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { +func (*netTunInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { return fs.NewFile(ctx, d, flags, &netTunFileOperations{}), nil } @@ -80,21 +83,22 @@ type netTunFileOperations struct { var _ fs.FileOperations = (*netTunFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (fops *netTunFileOperations) Release() { - fops.device.Release() +func (n *netTunFileOperations) Release(ctx context.Context) { + n.device.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. -func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (n *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { request := args[1].Uint() data := args[2].Pointer() + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + switch request { case linux.TUNSETIFF: - t := kernel.TaskFromContext(ctx) - if t == nil { - panic("Ioctl should be called from a task context") - } if !t.HasCapability(linux.CAP_NET_ADMIN) { return 0, syserror.EPERM } @@ -104,27 +108,32 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u } var req linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, io, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := req.CopyIn(t, data); err != nil { return 0, err } flags := usermem.ByteOrder.Uint16(req.Data[:]) - return 0, fops.device.SetIff(stack.Stack, req.Name(), flags) + created, err := n.device.SetIff(stack.Stack, req.Name(), flags) + if err == nil && created { + // Always start with an ARP address for interfaces so they can handle ARP + // packets. + nicID := n.device.NICID() + if err := stack.Stack.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + panic(fmt.Sprintf("failed to add ARP address after creating new TUN/TAP interface with ID = %d", nicID)) + } + } + return 0, err case linux.TUNGETIFF: var req linux.IFReq - copy(req.IFName[:], fops.device.Name()) + copy(req.IFName[:], n.device.Name()) // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. - flags := fops.device.Flags() | linux.IFF_NOFILTER + flags := n.device.Flags() | linux.IFF_NOFILTER usermem.ByteOrder.PutUint16(req.Data[:], flags) - _, err := usermem.CopyObjectOut(ctx, io, data, &req, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := req.CopyOut(t, data) return 0, err default: @@ -133,41 +142,41 @@ func (fops *netTunFileOperations) Ioctl(ctx context.Context, file *fs.File, io u } // Write implements fs.FileOperations.Write. -func (fops *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { +func (n *netTunFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { data := make([]byte, src.NumBytes()) if _, err := src.CopyIn(ctx, data); err != nil { return 0, err } - return fops.device.Write(data) + return n.device.Write(data) } // Read implements fs.FileOperations.Read. -func (fops *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - data, err := fops.device.Read() +func (n *netTunFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + data, err := n.device.Read() if err != nil { return 0, err } - n, err := dst.CopyOut(ctx, data) - if n > 0 && n < len(data) { + bytesCopied, err := dst.CopyOut(ctx, data) + if bytesCopied > 0 && bytesCopied < len(data) { // Not an error for partial copying. Packet truncated. err = nil } - return int64(n), err + return int64(bytesCopied), err } // Readiness implements watier.Waitable.Readiness. -func (fops *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - return fops.device.Readiness(mask) +func (n *netTunFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return n.device.Readiness(mask) } // EventRegister implements watier.Waitable.EventRegister. -func (fops *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - fops.device.EventRegister(e, mask) +func (n *netTunFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + n.device.EventRegister(e, mask) } // EventUnregister implements watier.Waitable.EventUnregister. -func (fops *netTunFileOperations) EventUnregister(e *waiter.Entry) { - fops.device.EventUnregister(e) +func (n *netTunFileOperations) EventUnregister(e *waiter.Entry) { + n.device.EventUnregister(e) } // isNetTunSupported returns whether /dev/net/tun device is supported for s. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 65be12175..00c526b03 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { for _, w := range d.children { if child := w.Get(); child != nil { child.(*Dirent).SyncAll(ctx) - child.DecRef() + child.DecRef(ctx) } } } @@ -413,9 +413,9 @@ func (d *Dirent) descendantOf(p *Dirent) bool { // Inode.Lookup, otherwise walk will keep d.mu locked. // // Preconditions: -// - renameMu must be held for reading. -// - d.mu must be held. -// - name must must not contain "/"s. +// * renameMu must be held for reading. +// * d.mu must be held. +// * name must must not contain "/"s. func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) { if !IsDir(d.Inode.StableAttr) { return nil, syscall.ENOTDIR @@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // which don't hold a hard reference on their parent (their parent holds a // hard reference on them, and they contain virtually no state). But this is // good house-keeping. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // their pins on the child. Inotify doesn't properly support filesystems that // revalidate dirents (since watches are lost on revalidation), but if we fail // to unpin the watches child will never be GCed. - cd.Inode.Watches.Unpin(cd) + cd.Inode.Watches.Unpin(ctx, cd) // This child needs to be revalidated, fallthrough to unhash it. Make sure // to not leak a reference from Get(). // // Note that previous lookups may still have a reference to this stale child; // this can't be helped, but we can ensure that *new* lookups are up-to-date. - child.DecRef() + child.DecRef(ctx) } // Either our weak reference expired or we need to revalidate it. Unhash child first, we're // about to replace it. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be @@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // There are active references to the existing child, prefer it to the one we // retrieved from Lookup. Likely the Lookup happened very close to the insertion // of child, so considering one stale over the other is fairly arbitrary. - c.DecRef() + c.DecRef(ctx) // The child that was installed could be negative. if cd.IsNegative() { // If so, don't leak a reference and short circuit. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child // we looked up. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Give the looked up child a parent. We cannot kick out entries, since we just checked above @@ -577,9 +577,9 @@ func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, // exists returns true if name exists in relation to d. // // Preconditions: -// - renameMu must be held for reading. -// - d.mu must be held. -// - name must must not contain "/"s. +// * renameMu must be held for reading. +// * d.mu must be held. +// * name must must not contain "/"s. func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { child, err := d.walk(ctx, root, name, false /* may unlock */) if err != nil { @@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { return false } // Child exists. - child.DecRef() + child.DecRef(ctx) return true } @@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent - d.finishCreate(child, name) + d.finishCreate(ctx, child, name) // Return the reference and the new file. When the last reference to // the file is dropped, file.Dirent may no longer be cached. @@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // finishCreate validates the created file, adds it as a child of this dirent, // and notifies any watchers. -func (d *Dirent) finishCreate(child *Dirent, name string) { +func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) { panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) @@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Unhash the negative Dirent, name needs to exist now. delete(d.children, name) // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } // Execute the create operation. @@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans if e != nil { return e } - d.finishCreate(childDir, name) + d.finishCreate(ctx, childDir, name) return nil }) if err == syscall.EEXIST { @@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // references to children. // // Preconditions: d.mu must be held. -func (d *Dirent) flush() { +func (d *Dirent) flush(ctx context.Context) { expired := make(map[string]*refs.WeakRef) for n, w := range d.children { // Call flush recursively on each child before removing our @@ -912,7 +912,7 @@ func (d *Dirent) flush() { if !cd.IsNegative() { // Flush the child. cd.mu.Lock() - cd.flush() + cd.flush(ctx) cd.mu.Unlock() // Allow the file system to drop extra references on child. @@ -920,13 +920,13 @@ func (d *Dirent) flush() { } // Don't leak a reference. - child.DecRef() + child.DecRef(ctx) } // Check if the child dirent is closed, and mark it as expired if it is. // We must call w.Get() again here, since the child could have been closed // by the calls to flush() and cache.Remove() in the above if-block. if child := w.Get(); child != nil { - child.DecRef() + child.DecRef(ctx) } else { expired[n] = w } @@ -935,7 +935,7 @@ func (d *Dirent) flush() { // Remove expired entries. for n, w := range expired { delete(d.children, n) - w.Drop() + w.Drop(ctx) } } @@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // Note that even though `d` is now hidden, it still holds a reference // to its parent. @@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // d is not reachable anymore, and hence not mounted anymore. d.mounted = false // Drop mount reference. - d.DecRef() + d.DecRef(ctx) return nil } @@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // Remove cannot remove directories. if IsDir(child.Inode.StableAttr) { @@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // inode may have other links. If this was the last link, the events for the // watch removal will be queued by the inode destructor. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) return nil @@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // RemoveDirectory can only remove directories. if !IsDir(child.Inode.StableAttr) { @@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Finally, let inotify know the child is being unlinked. Drop any extra // refs from inotify to this child dirent. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) return nil } // destroy closes this node and all children. -func (d *Dirent) destroy() { +func (d *Dirent) destroy(ctx context.Context) { if d.IsNegative() { // Nothing to tear-down and no parent references to drop, since a negative // Dirent does not take a references on its parent, has no Inode and no children. @@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() { if c.(*Dirent).IsNegative() { // The parent holds both weak and strong refs in the case of // negative dirents. - c.DecRef() + c.DecRef(ctx) } // Drop the reference we just acquired in WeakRef.Get. - c.DecRef() + c.DecRef(ctx) } - w.Drop() + w.Drop(ctx) } d.children = nil allDirents.remove(d) // Drop our reference to the Inode. - d.Inode.DecRef() + d.Inode.DecRef(ctx) // Allow the Dirent to be GC'ed after this point, since the Inode may still // be referenced after the Dirent is destroyed (for instance by filesystem @@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() { // Drop the reference we have on our parent if we took one. renameMu doesn't need to be // held because d can't be reparented without any references to it left. if d.parent != nil { - d.parent.DecRef() + d.parent.DecRef(ctx) } } @@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool { // DecRef decreases the Dirent's refcount and drops its reference on its mount. // // DecRef implements RefCounter.DecRef with destructor d.destroy. -func (d *Dirent) DecRef() { +func (d *Dirent) DecRef(ctx context.Context) { if d.Inode != nil { // Keep mount around, since DecRef may destroy d.Inode. msrc := d.Inode.MountSource - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) msrc.DecDirentRefs() } else { - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) } } @@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error if err != nil { return err } - defer victim.DecRef() + defer victim.DecRef(ctx) return d.mayDelete(ctx, victim) } @@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string if err != nil { return err } - defer renamed.DecRef() + defer renamed.DecRef(ctx) // Check that the renamed dirent is deletable. if err := oldParent.mayDelete(ctx, renamed); err != nil { @@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that we can delete replaced. if err := newParent.mayDelete(ctx, replaced); err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } // Target should not be an ancestor of source. if oldParent.descendantOf(replaced) { - replaced.DecRef() + replaced.DecRef(ctx) // Note that Linux returns EINVAL if the source is an // ancestor of target, but ENOTEMPTY if the target is @@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that replaced is not a mount point. if replaced.isMountPointLocked() { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EBUSY } @@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string oldIsDir := IsDir(renamed.Inode.StableAttr) newIsDir := IsDir(replaced.Inode.StableAttr) if !newIsDir && oldIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.ENOTDIR } if !oldIsDir && newIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EISDIR } @@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. - replaced.Inode.Watches.Unpin(replaced) + replaced.Inode.Watches.Unpin(ctx, replaced) replaced.mu.Lock() - replaced.flush() + replaced.flush(ctx) replaced.mu.Unlock() // Done with replaced. - replaced.DecRef() + replaced.DecRef(ctx) } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil { @@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // can't destroy oldParent (and try to retake its lock) because // Rename's caller must be holding a reference. newParent.IncRef() - oldParent.DecRef() + oldParent.DecRef(ctx) } if w, ok := newParent.children[newName]; ok { - w.Drop() + w.Drop(ctx) delete(newParent.children, newName) } if w, ok := oldParent.children[oldName]; ok { - w.Drop() + w.Drop(ctx) delete(oldParent.children, oldName) } @@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Same as replaced.flush above. renamed.mu.Lock() - renamed.flush() + renamed.flush(ctx) renamed.mu.Unlock() return nil diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 33de32c69..7d9dd717e 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.DecRef() + d.DecRef(context.Background()) c.currentSize-- if c.limit != nil { c.limit.dec() diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 98d69c6f2..176b894ba 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } - d.DecRef() + d.DecRef(ctx) if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) @@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } - root.flush() + root.flush(ctx) if got := len(root.children); got != 0 { t.Fatalf("root has %d children, want %d", got, 0) @@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("child has a ref count of %d, want %d", got, 2) } - child.DecRef() + child.DecRef(ctx) if got := child.(*Dirent).ReadRefs(); got != 1 { t.Fatalf("child has a ref count of %d, want %d", got, 1) @@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root has %d children, want %d", got, 1) } - root.DecRef() + root.DecRef(ctx) if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) @@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } - d.DecRef() + d.DecRef(ctx) - test.root.flush() + test.root.flush(ctx) if got := len(test.root.children); got != 0 { t.Errorf("root has %d children, want %d", got, 0) @@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) { t.Fatalf("Rename got error %v, want nil", err) } - oldParent.flush() - newParent.flush() + oldParent.flush(ctx) + newParent.flush(ctx) // Expect to have only active references. if got := renamed.ReadRefs(); got != 1 { diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index f623d6c0e..67a35f0b2 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -18,6 +18,7 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" ) @@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent { for name, w := range d.children { if rc := w.Get(); rc != nil { // Drop the reference count obtain in w.Get() - rc.DecRef() + rc.DecRef(context.Background()) cd := rc.(*Dirent) if cd.IsNegative() { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 9fce177ad..b99199798 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even } // Release implements fs.FileOperations.Release. -func (p *pipeOperations) Release() { +func (p *pipeOperations) Release(context.Context) { fdnotifier.RemoveFD(int32(p.file.FD())) p.file.Close() p.file = nil diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index e556da48a..b9cec4b13 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) { // Cleanup the state of the pipe, and remove the fd from the // fdnotifier. Sadly this needed to maintain the correctness // of other tests because the fdnotifier is global. - pipeOps.Release() + pipeOps.Release(ctx) } continue } @@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) { } if pipeOps != nil { // Same as above. - pipeOps.Release() + pipeOps.Release(ctx) } } } @@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) { pipeOps, err := Open(ctx, opener, flags) if pipeOps != nil { // Same as TestTryOpen. - pipeOps.Release() + pipeOps.Release(ctx) } // Check that the partner opened the file successfully. @@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { ctx := contexttest.Context(t) pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) if pipeOps != nil { - pipeOps.Release() + pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) } if err != syserror.ErrWouldBlock { @@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { if pipeOps == nil { t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) } - defer pipeOps.Release() + defer pipeOps.Release(ctx) if err != nil { t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) @@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) { f := <-fdchan if f < 0 { t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) - pipeOps.Release() + pipeOps.Release(ctx) continue } if test.hangupSelf { // Hangup self and assert that our partner got the expected hangup // error. - pipeOps.Release() + pipeOps.Release(ctx) if test.flags.Read { // Partner is writer. @@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) { } else { // Hangup our partner and expect us to get the hangup error. syscall.Close(f) - defer pipeOps.Release() + defer pipeOps.Release(ctx) if test.flags.Read { assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index a0082ecca..1c9e82562 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) { } f := fd.New(gfd) - p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer) if p != nil { // This is necessary to remove the fd from the global fd notifier. - defer p.Release() + defer p.Release(ctx) } else { // If there is no p to DecRef on, because newPipeOperations failed, then the // file still needs to be closed. @@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) { syscall.Close(fds[1]) // Test the read end, but it doesn't really matter which. - p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil) if err != nil { f.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } // Drop our only reference, which should trigger the destructor. - p.Release() + p.Release(ctx) if fdnotifier.HasFD(int32(fds[0])) { t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) @@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) { if err != nil { t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p) @@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) { rfile.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests will // likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests // will likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 846252c89..72ea70fcf 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper } // DecRef destroys the File when it is no longer referenced. -func (f *File) DecRef() { - f.DecRefWithDestructor(func() { +func (f *File) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, func(context.Context) { // Drop BSD style locks. lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} - f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng) + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. - f.FileOperations.Release() + f.FileOperations.Release(ctx) // Release a reference on the Dirent. - f.Dirent.DecRef() + f.Dirent.DecRef(ctx) // Only unregister if we are currently registered. There is nothing // to register if f.async is nil (this happens when async mode is @@ -310,7 +310,6 @@ func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error if !f.mu.Lock(ctx) { return 0, syserror.ErrInterrupted } - unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) // Handle append mode. if f.Flags().Append { @@ -355,7 +354,6 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 // offset." unlockAppendMu := f.Dirent.Inode.lockAppendMu(f.Flags().Append) defer unlockAppendMu() - if f.Flags().Append { if err := f.offsetForAppend(ctx, &offset); err != nil { return 0, err @@ -374,9 +372,10 @@ func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64 return f.FileOperations.Write(ctx, f, src, offset) } -// offsetForAppend sets the given offset to the end of the file. +// offsetForAppend atomically sets the given offset to the end of the file. // -// Precondition: the file.Dirent.Inode.appendMu mutex should be held for writing. +// Precondition: the file.Dirent.Inode.appendMu mutex should be held for +// writing. func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { uattr, err := f.Dirent.Inode.UnstableAttr(ctx) if err != nil { @@ -386,7 +385,7 @@ func (f *File) offsetForAppend(ctx context.Context, offset *int64) error { } // Update the offset. - *offset = uattr.Size + atomic.StoreInt64(offset, uattr.Size) return nil } @@ -461,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { func (f *File) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := f.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index beba0f771..6ec721022 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -67,7 +67,7 @@ type SpliceOpts struct { // - File.Flags(): This value may change during the operation. type FileOperations interface { // Release release resources held by FileOperations. - Release() + Release(ctx context.Context) // Waitable defines how this File can be waited on for read and // write readiness. @@ -159,7 +159,9 @@ type FileOperations interface { // io provides access to the virtual memory space to which pointers in args // refer. // - // Preconditions: The AddressSpace (if any) that io refers to is activated. + // Preconditions: + // * The AddressSpace (if any) that io refers to is activated. + // * Must only be called from a task goroutine. Ioctl(ctx context.Context, file *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) } diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index dcc1df38f..9dc58d5ff 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err // Drop the extra reference on the Dirent. Now there's only one reference // on the dirent, either owned by f (if non-nil), or the Dirent is about // to be destroyed (if GetFile failed). - dirent.DecRef() + dirent.DecRef(ctx) return f, err } @@ -89,12 +89,12 @@ type overlayFileOperations struct { } // Release implements FileOperations.Release. -func (f *overlayFileOperations) Release() { +func (f *overlayFileOperations) Release(ctx context.Context) { if f.upper != nil { - f.upper.DecRef() + f.upper.DecRef(ctx) } if f.lower != nil { - f.lower.DecRef() + f.lower.DecRef(ctx) } } @@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &DirCtx{ @@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { if err != nil { return nil, err } - defer dir.DecRef() + defer dir.DecRef(ctx) // Use a stub serializer to read the entries into memory. stubSerializer := &CollectEntriesSerializer{} @@ -521,10 +521,10 @@ type overlayMappingIdentity struct { } // DecRef implements AtomicRefCount.DecRef. -func (omi *overlayMappingIdentity) DecRef() { - omi.AtomicRefCount.DecRefWithDestructor(func() { - omi.overlayFile.DecRef() - omi.id.DecRef() +func (omi *overlayMappingIdentity) DecRef(ctx context.Context) { + omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + omi.overlayFile.DecRef(ctx) + omi.id.DecRef(ctx) }) } @@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 { func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := omi.overlayFile.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 084da2a8d..d41f30bbb 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -87,20 +87,6 @@ func RegisterFilesystem(f Filesystem) { filesystems.registered[f.Name()] = f } -// UnregisterFilesystem removes a file system from the global set. To keep the -// file system set compatible with save/restore, UnregisterFilesystem must be -// called before save/restore methods. -// -// For instance, packages may unregister their file system after it is mounted. -// This makes sense for pseudo file systems that should not be visible or -// mountable. See whitelistfs in fs/host/fs.go for one example. -func UnregisterFilesystem(name string) { - filesystems.mu.Lock() - defer filesystems.mu.Unlock() - - delete(filesystems.registered, name) -} - // FindFilesystem returns a Filesystem registered at name or (nil, false) if name // is not a file system type that can be found in /proc/filesystems. func FindFilesystem(name string) (Filesystem, bool) { diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index bdba6efe5..d2dbff268 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -42,9 +42,10 @@ // Dirent.dirMu // Dirent.mu // DirentCache.mu -// Locks in InodeOperations implementations or overlayEntry // Inode.Watches.mu (see `Inotify` for other lock ordering) // MountSource.mu +// Inode.appendMu +// Locks in InodeOperations implementations or overlayEntry // // If multiple Dirent or MountSource locks must be taken, locks in the parent must be // taken before locks in their children. diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 789369220..5fb419bcd 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -8,7 +8,6 @@ go_template_instance( out = "dirty_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "Dirty", @@ -25,14 +24,14 @@ go_template_instance( name = "frame_ref_set_impl", out = "frame_ref_set_impl.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "fsutil", prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "uint64", "Functions": "FrameRefSetFunctions", }, @@ -43,7 +42,6 @@ go_template_instance( out = "file_range_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "FileRange", @@ -86,7 +84,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/state", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index c6cd45087..2c9446c1d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. -func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { @@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. -func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { dseg := dirty.FirstSegment() for dseg.Ok() { if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { @@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max } // Preconditions: mr must be page-aligned. -func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { wbr := cseg.Range().Intersect(mr) if max < wbr.Start { diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 08695391c..dc9efa5df 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -31,7 +31,7 @@ import ( type FileNoopRelease struct{} // Release is a no-op. -func (FileNoopRelease) Release() {} +func (FileNoopRelease) Release(context.Context) {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 5643cdac9..1dc409d38 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -23,13 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a -// platform.File. It is used to implement Mappables that store data in +// memmap.File. It is used to implement Mappables that store data in // sparsely-allocated memory. // // type FileRangeSet <generated by go_generics> @@ -65,33 +64,38 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli } // FileRange returns the FileRange mapped by seg. -func (seg FileRangeIterator) FileRange() platform.FileRange { +func (seg FileRangeIterator) FileRange() memmap.FileRange { return seg.FileRangeOf(seg.Range()) } // FileRangeOf returns the FileRange mapped by mr. // -// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. -func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { +// Preconditions: +// * seg.Range().IsSupersetOf(mr). +// * mr.Length() != 0. +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) - return platform.FileRange{frstart, frstart + mr.Length()} + return memmap.FileRange{frstart, frstart + mr.Length()} } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mf with the given +// mapped to a memmap.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of // mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are -// invalid. +// invalid. fileSize is an upper bound on the file's size; bytes after fileSize +// will be zeroed without calling readAt. // // Fill may read offsets outside of required, but will never read offsets // outside of optional. It returns a non-nil error if any error occurs, even // if the error only affects offsets in optional, but not in required. // -// Preconditions: required.Length() > 0. optional.IsSupersetOf(required). -// required and optional must be page-aligned. -func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { +// Preconditions: +// * required.Length() > 0. +// * optional.IsSupersetOf(required). +// * required and optional must be page-aligned. +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { gap := frs.LowerBoundGap(required.Start) for gap.Ok() && gap.Start() < required.End { if gap.Range().Length() == 0 { @@ -104,7 +108,21 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { - n, err := readAt(ctx, dsts, gr.Start+done) + n, err := func() (uint64, error) { + off := gr.Start + done + if off >= fileSize { + return 0, io.EOF + } + if off+dsts.NumBytes() > fileSize { + rd := fileSize - off + n, err := readAt(ctx, dsts.TakeFirst64(rd), off) + if n == rd && err == nil { + return n, io.EOF + } + return n, err + } + return readAt(ctx, dsts, off) + }() done += n dsts = dsts.DropFirst64(n) if err != nil { @@ -141,7 +159,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map } // Drop removes segments for memmap.Mappable offsets in mr, freeing the -// corresponding platform.FileRanges. +// corresponding memmap.FileRanges. // // Preconditions: mr must be page-aligned. func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { @@ -154,7 +172,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { } // DropAll removes all segments in mr, freeing the corresponding -// platform.FileRanges. +// memmap.FileRanges. func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { mf.DecRef(seg.FileRange()) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd6f5aba6..a808894df 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -17,7 +17,7 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" ) @@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } // IncRefAndAccount adds a reference on the range fr. All newly inserted segments // are accounted as host page cache memory mappings. -func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) { seg, gap := refs.Find(fr.Start) for { switch { @@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { // DecRefAndAccount removes a reference on the range fr and untracks segments // that are removed from memory accounting. -func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) { seg := refs.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index e82afd112..4468f5dd2 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -71,6 +70,13 @@ func (f *HostFileMapper) Init() { f.mappings = make(map[uint64]mapping) } +// IsInited returns true if f.Init() has been called. This is used when +// restoring a checkpoint that contains a HostFileMapper that may or may not +// have been initialized. +func (f *HostFileMapper) IsInited() bool { + return f.refs != nil +} + // NewHostFileMapper returns an initialized HostFileMapper allocated on the // heap with no references or cached mappings. func NewHostFileMapper() *HostFileMapper { @@ -81,7 +87,9 @@ func NewHostFileMapper() *HostFileMapper { // IncRefOn increments the reference count on all offsets in mr. // -// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +// Preconditions: +// * mr.Length() != 0. +// * mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() @@ -98,7 +106,9 @@ func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { // DecRefOn decrements the reference count on all offsets in mr. // -// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +// Preconditions: +// * mr.Length() != 0. +// * mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() @@ -126,7 +136,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. -func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { +func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() @@ -146,7 +156,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) } // Preconditions: f.mapsMu must be locked. -func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { +func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := syscall.PROT_READ if write { prot |= syscall.PROT_WRITE @@ -205,7 +215,9 @@ func (f *HostFileMapper) UnmapAll() { } } -// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +// Preconditions: +// * f.mapsMu must be locked. +// * f.mappings[chunkStart] == m. func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { // This leaks address space and is unexpected, but is otherwise diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 78fec553e..c15d8a946 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -21,18 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// HostMappable implements memmap.Mappable and platform.File over a +// HostMappable implements memmap.Mappable and memmap.File over a // CachedFileObject. // // Lock order (compare the lock order model in mm/mm.go): // truncateMu ("fs locks") // mu ("memmap.Mappable locks not taken by Translate") -// ("platform.File locks") +// ("memmap.File locks") // backingFile ("CachedFileObject locks") // // +stateify savable @@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error { return nil } -// MapInternal implements platform.File.MapInternal. -func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (h *HostMappable) FD() int { return h.backingFile.FD() } -// IncRef implements platform.File.IncRef. -func (h *HostMappable) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (h *HostMappable) IncRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.IncRefOn(mr) } -// DecRef implements platform.File.DecRef. -func (h *HostMappable) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (h *HostMappable) DecRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.DecRefOn(mr) } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 800c8b4e1..82eda3e43 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -22,11 +22,9 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -445,7 +443,7 @@ func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs. // time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) { c.attr.AccessTime = now c.dirtyAttr.AccessTime = true } @@ -462,7 +460,7 @@ func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx contex // and status change times in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) { c.attr.ModificationTime = now c.dirtyAttr.ModificationTime = true c.attr.StatusChangeTime = now @@ -481,7 +479,7 @@ func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) { // in-place to the current time. // // Preconditions: c.attrMu is locked for writing. -func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) { +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) { c.attr.StatusChangeTime = now c.dirtyAttr.StatusChangeTime = true } @@ -646,7 +644,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { End: fs.OffsetPageEnd(int64(gapMR.End)), } optMR := gap.Range() - err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt) + err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt) mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.c.cache.Find(uint64(rw.offset)) if !seg.Ok() { @@ -673,9 +671,6 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} } - - default: - break } } unlock() @@ -685,7 +680,9 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { // maybeGrowFile grows the file's size if data has been written past the old // size. // -// Preconditions: rw.c.attrMu and rw.c.dataMu bust be locked. +// Preconditions: +// * rw.c.attrMu must be locked. +// * rw.c.dataMu must be locked. func (rw *inodeReadWriter) maybeGrowFile() { // If the write ends beyond the file's previous size, it causes the // file to grow. @@ -767,9 +764,6 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error // Continue. seg, gap = gap.NextSegment(), FileRangeGapIterator{} - - default: - break } } rw.maybeGrowFile() @@ -876,7 +870,7 @@ func (c *CachingInodeOperations) Translate(ctx context.Context, required, option } mf := c.mfp.MemoryFile() - cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -934,7 +928,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. c.mapsMu.Lock() @@ -999,10 +993,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable } } -// IncRef implements platform.File.IncRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// IncRef implements memmap.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { +func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg, gap := c.refs.Find(fr.Start) @@ -1024,10 +1018,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { } } -// DecRef implements platform.File.DecRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// DecRef implements memmap.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { +func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg := c.refs.FindSegment(fr.Start) @@ -1046,15 +1040,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { c.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. This is used when we +// MapInternal implements memmap.File.MapInternal. This is used when we // directly map an underlying host fd and CachingInodeOperations is used as the -// platform.File during translation. -func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// memmap.File during translation. +func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// FD implements memmap.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. func (c *CachingInodeOperations) FD() int { return c.backingFile.FD() diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index b43c082a7..05e043583 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -76,9 +76,10 @@ ops can be implemented in parallel. #### Minimal client that can mount a trivial FUSE filesystem. -- Implement `/dev/fuse`. +- Implement `/dev/fuse` - a character device used to establish an FD for + communication between the sentry and the server daemon. -- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. +- Implement basic FUSE ops like `FUSE_INIT`. #### Read-only mount with basic file operations @@ -94,6 +95,103 @@ ops can be implemented in parallel. - Implement the remaining FUSE ops and decide if we can omit rarely used operations like ioctl. +### Design Details + +#### Lifecycle for a FUSE Request + +- User invokes a syscall +- Sentry prepares corresponding request + - If FUSE device is available + - Write the request in binary + - If FUSE device is full + - Kernel task blocked until available +- Sentry notifies the readers of fuse device that it's ready for read +- FUSE daemon reads the request and processes it +- Sentry waits until a reply is written to the FUSE device + - but returns directly for async requests +- FUSE daemon writes to the fuse device +- Sentry processes the reply + - For sync requests, unblock blocked kernel task + - For async requests, execute pre-specified callback if any +- Sentry returns the syscall to the user + +#### Channels and Queues for Requests in Different Stages + +`connection.initializedChan` + +- a channel that the requests issued before connection initialization blocks + on. + +`fd.queue` + +- a queue of requests that haven’t been read by the FUSE daemon yet. + +`fd.completions` + +- a map of the requests that have been prepared but not yet received a + response, including the ones on the `fd.queue`. + +`fd.waitQueue` + +- a queue of waiters that is waiting for the fuse device fd to be available, + such as the FUSE daemon. + +`fd.fullQueueCh` + +- a channel that the kernel task will be blocked on when the fd is not + available. + +#### Basic I/O Implementation + +Currently we have implemented basic functionalities of read and write for our +FUSE. We describe the design and ways to improve it here: + +##### Basic FUSE Read + +The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and +`vfs.FileDescriptionImpl.PRead()`. When a syscall is made, it will eventually +reach our implementation of those interface functions located at +`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files. + +After validation checks of the input, sentry sends `FUSE_READ` requests to the +FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the +responses. For the first version, we create a copy in kernel memory of those +data. They are represented as a byte slice in the marshalled struct. This +happens as a common process for all the FUSE responses at this moment at +`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this +intermediate buffer to the input buffer provided by the read syscall. + +There is an extra requirement for FUSE: When mounting the FUSE fs, the mounter +or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are +the upperbound of the bytes to read in each `FUSE_READ` request. We implemented +the code to handle the fragmented reads. + +To improve the performance: ideally we should have buffer cache to copy those +data from the responses of FUSE daemon into, as is also the design of several +other existing file system implementations for sentry, instead of a single-use +temporary buffer. Directly mapping the memory of one process to another could +also boost the performance, but to keep them isolated, we did not choose to do +so. + +##### Basic FUSE Write + +The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and +`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a +user makes write(2) and pwrite(2) syscall. + +For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header +(can be regarded as a request with 2 payloads) to the FUSE daemon. For the first +version, we allocate a buffer inside kernel memory to store the bytes from the +user, and copy directly from that buffer to the memory of FUSE daemon. This +happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()` + +The parameters `max_write` and `max_pages` restrict the number of bytes in one +`FUSE_WRITE`. There are code handling fragmented writes in current +implementation. + +To have better performance: the extra copy created to store the bytes to write +can be replaced by the buffer cache as well. + # Appendix ## FUSE Protocol diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index b2fcab127..c0bc63a32 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } // Release implements fs.FileOpeations.Release. -func (f *fileOperations) Release() { +func (f *fileOperations) Release(context.Context) { f.handles.DecRef() } @@ -122,7 +122,7 @@ func (f *fileOperations) Release() { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 2df2fe889..326fed954 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) { // We must release the dirent, of the test will fail // with a reference leak. This is tracked by p9test. - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Walk again. Depending on the cache policy, we may // get a new dirent. @@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) { if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Modify the underlying mocked file's modification // time for the next walk that occurs. @@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) { if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Remove the file from the remote fs, subsequent walks // should now fail to find anything. @@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) { t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } if err == nil { - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. } }) } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index fc14249be..f324dbf26 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -47,7 +47,8 @@ type handles struct { // DecRef drops a reference on handles. func (h *handles) DecRef() { - h.DecRefWithDestructor(func() { + ctx := context.Background() + h.DecRefWithDestructor(ctx, func(context.Context) { if h.Host != nil { if h.isHostBorrowed { h.Host.Release() @@ -57,7 +58,7 @@ func (h *handles) DecRef() { } } } - if err := h.File.close(context.Background()); err != nil { + if err := h.File.close(ctx); err != nil { log.Warningf("error closing p9 file: %v", err) } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index a016c896e..3a225fd39 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) { // asynchronously. // // We use AsyncWithContext to avoid needing to allocate an extra - // anonymous function on the heap. - fs.AsyncWithContext(ctx, i.fileState.Release) + // anonymous function on the heap. We must use background context + // because the async work cannot happen on the task context. + fs.AsyncWithContext(context.Background(), i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. @@ -640,7 +641,7 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { - if !i.session().cachePolicy.cacheUAttrs(inode) { + if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) { return nil } diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index cf9800100..3c66dc3c2 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - defer d.DecRef() + defer d.DecRef(ctx) // Construct the new file, caching the handles if allowed. h := handles{ @@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { - defer d.DecRef() + defer d.DecRef(ctx) if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { switch iops := d.Inode.InodeOperations.(type) { @@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(*key) + i.session().overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b5efc86f2..7cf3522ff 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *overrideMaps) remove(key device.MultiDeviceKey) { +func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { endpoint := e.keyMap[key] delete(e.keyMap, key) - endpoint.dirent.DecRef() + endpoint.dirent.DecRef(ctx) } // lock blocks other addition and removal operations from happening while @@ -197,7 +197,7 @@ type session struct { } // Destroy tears down the session. -func (s *session) Destroy() { +func (s *session) Destroy(ctx context.Context) { s.client.Close() } @@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF ctx.UninterruptibleSleepFinish(false) if err != nil { // Same as above. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF if err != nil { s.attach.close(ctx) // Same as above, but after we execute the Close request. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. -func (s *session) fillPathMap() error { +func (s *session) fillPathMap(ctx context.Context) error { unlock := s.overrides.lock() defer unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() - defer mountRoot.DecRef() + defer mountRoot.DecRef(ctx) dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 2d398b753..48b423dd8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -26,7 +26,8 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { if s.overrides != nil { - if err := s.fillPathMap(); err != nil { + ctx := &dummyClockContext{context.Background()} + if err := s.fillPathMap(ctx); err != nil { panic("failed to save paths to override map before saving" + err.Error()) } } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 40f2c1cad..8a1c69ac2 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.inode.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.inode.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index aabce6cc9..1368014c4 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -30,7 +30,9 @@ go_library( "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", + "//pkg/iovec", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/secio", diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 39299b7e4..0d8d36afa 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 3e48b8b2c..86d1a87f0 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) dirent := fs.NewDirent(ctx, inode, name) - defer dirent.DecRef() + defer dirent.DecRef(ctx) if isTTY { return newTTYFile(ctx, dirent, flags, iops), nil @@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 62f1246aa..fbfba1b58 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -368,6 +368,9 @@ func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, // WriteOut implements fs.InodeOperations.WriteOut. func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + if inode.MountSource.Flags.ReadOnly { + return nil + } // Have we been using host kernel metadata caches? if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { // Then the metadata is already up to date on the host. diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index c507f57eb..41a23b5da 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) { if err != nil { t.Fatalf("Failed to create File: %v", err) } - file.DecRef() + file.DecRef(ctx) s := make([]byte, 10) if c, err := syscall.Read(p[0], s); c != 0 || err != nil { diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index b6e94583e..a2f3d5918 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -39,11 +38,6 @@ import ( // LINT.IfChange -// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. -// -// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). -const maxSendBufferSize = 8 << 20 - // ConnectedEndpoint is a host FD backed implementation of // transport.ConnectedEndpoint and transport.Receiver. // @@ -103,10 +97,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error { if err != nil { return syserr.FromError(err) } - if sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", sndbuf) - return syserr.ErrInvalidEndpointState - } c.stype = linux.SockType(stype) c.sndbuf = int64(sndbuf) @@ -204,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -281,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -328,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek } // close releases all resources related to the endpoint. -func (c *ConnectedEndpoint) close() { +func (c *ConnectedEndpoint) close(context.Context) { fdnotifier.RemoveFD(int32(c.file.FD())) c.file.Close() c.file = nil @@ -384,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } // Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(c.close) +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, c.close) } // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index 5c18dbd5e..905afb50d 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -17,15 +17,12 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange -// maxIovs is the maximum number of iovecs to pass to the host. -var maxIovs = linux.UIO_MAXIOV - // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { @@ -76,7 +73,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec } } - if iovsRequired > maxIovs { + if iovsRequired > iovec.MaxIovs { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index affdbcacb..9d58ea448 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) { if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { t.Fatalf("Expected socket %v to be blocking", pair[1]) } - sock, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sock, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) failed => %v", pair[0], err) } - defer sock.DecRef() + defer sock.DecRef(ctx) // Test that the socket now is non-blocking. if fl, err = getFl(pair[0]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) @@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) buf := []byte("hello world\n") n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) if err != nil { @@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) if err != nil { t.Fatalf("socket writev failed: %v", err) @@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - sfile, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile.DecRef() + defer sfile.DecRef(ctx) s := sfile.FileOperations.(socket.Socket) n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{}) @@ -158,18 +162,19 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile1, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile1.DecRef() + defer sfile1.DecRef(ctx) socket1 := sfile1.FileOperations.(socket.Socket) - sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + sfile2, err := newSocket(ctx, pair[1], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[1], err) } - defer sfile2.DecRef() + defer sfile2.DecRef(ctx) socket2 := sfile2.FileOperations.(socket.Socket) // Socketpairs can not be listened to. @@ -185,11 +190,11 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile3, err := newSocket(contexttest.Context(t), sock, false) + sfile3, err := newSocket(ctx, sock, false) if err != nil { t.Fatalf("newSocket(%v) => %v", sock, err) } - defer sfile3.DecRef() + defer sfile3.DecRef(ctx) socket3 := sfile3.FileOperations.(socket.Socket) // This socket is not bound so we can't listen on it. @@ -237,9 +242,10 @@ func TestRelease(t *testing.T) { } c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} want := &ConnectedEndpoint{queue: c.queue} - want.ref.DecRef() + ctx := contexttest.Context(t) + want.ref.DecRef(ctx) fdnotifier.AddFD(int32(c.file.FD()), nil) - c.Release() + c.Release(ctx) if !reflect.DeepEqual(c, want) { t.Errorf("got = %#v, want = %#v", c, want) } diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index 5d4f312cf..c8231e0aa 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -65,10 +65,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index cb91355ab..1183727ab 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -17,6 +17,7 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,7 +54,7 @@ type TTYFileOperations struct { func newTTYFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations) *fs.File { return fs.NewFile(ctx, dirent, flags, &TTYFileOperations{ fileOperations: fileOperations{iops: iops}, - termios: linux.DefaultSlaveTermios, + termios: linux.DefaultReplicaTermios, }) } @@ -113,16 +114,21 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme } // Release implements fs.FileOperations.Release. -func (t *TTYFileOperations) Release() { +func (t *TTYFileOperations) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileOperations.Release() + t.fileOperations.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: fd := t.fileOperations.iops.fileState.FD() ioctl := args[1].Uint64() @@ -132,9 +138,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = termios.CopyOut(task, args[2].Pointer()) return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: @@ -146,9 +150,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO } var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) @@ -173,10 +175,8 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Map the ProcessGroup into a ProcessGroupID in the task's PID // namespace. - pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }) + pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup)) + _, err := pgID.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: @@ -184,11 +184,6 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. - task := kernel.TaskFromContext(ctx) - if task == nil { - return 0, syserror.ENOTTY - } - t.mu.Lock() defer t.mu.Unlock() @@ -208,12 +203,11 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO return 0, syserror.ENOTTY } - var pgID kernel.ProcessGroupID - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgIDP primitive.Int32 + if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } + pgID := kernel.ProcessGroupID(pgIDP) // pgID must be non-negative. if pgID < 0 { @@ -242,9 +236,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = winsize.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSWINSZ: @@ -255,9 +247,7 @@ func (t *TTYFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO // background ones) can set the winsize. var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetWinsize(fd, &winsize) @@ -308,9 +298,9 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but - // tty_check_change is more of a blacklist of cases than a - // whitelist, and is surprisingly permissive. Allowing the - // change seems most appropriate. + // tty_check_change only blocks specific cases and is + // surprisingly permissive. Allowing the change seems + // appropriate. return nil } @@ -358,7 +348,7 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return kernel.ERESTARTSYS + return syserror.ERESTARTSYS } // LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index ce397a5e3..c143f4ce2 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -39,7 +39,7 @@ func TestWait(t *testing.T) { t.Fatalf("NewFile failed: %v", err) } - defer file.DecRef() + defer file.DecRef(ctx) r := file.Readiness(waiter.EventIn) if r != 0 { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a34fbc946..004910453 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt } // DecRef drops a reference on the Inode. -func (i *Inode) DecRef() { - i.DecRefWithDestructor(i.destroy) +func (i *Inode) DecRef(ctx context.Context) { + i.DecRefWithDestructor(ctx, i.destroy) } // destroy releases the Inode and releases the msrc reference taken. -func (i *Inode) destroy() { - ctx := context.Background() +func (i *Inode) destroy(ctx context.Context) { if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. @@ -122,12 +121,12 @@ func (i *Inode) destroy() { i.Watches.targetDestroyed() if i.overlay != nil { - i.overlay.release() + i.overlay.release(ctx) } else { i.InodeOperations.Release(ctx) } - i.MountSource.DecRef() + i.MountSource.DecRef(ctx) } // Mappable calls i.InodeOperations.Mappable. @@ -271,7 +270,7 @@ func (i *Inode) GetXattr(ctx context.Context, name string, size uint64) (string, // SetXattr calls i.InodeOperations.SetXattr with i as the Inode. func (i *Inode) SetXattr(ctx context.Context, d *Dirent, name, value string, flags uint32) error { if i.overlay != nil { - return overlaySetxattr(ctx, i.overlay, d, name, value, flags) + return overlaySetXattr(ctx, i.overlay, d, name, value, flags) } return i.InodeOperations.SetXattr(ctx, i, name, value, flags) } diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index efd3c962b..9911a00c2 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) { } // Unpin unpins dirent from all watches in this set. -func (w *Watches) Unpin(d *Dirent) { +func (w *Watches) Unpin(ctx context.Context, d *Dirent) { w.mu.RLock() defer w.mu.RUnlock() for _, watch := range w.ws { - watch.Unpin(d) + watch.Unpin(ctx, d) } } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 537c8d257..b16ab08ba 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -16,7 +16,6 @@ package fs import ( "fmt" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -85,7 +84,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name upperInode = child.Inode upperInode.IncRef() } - child.DecRef() + child.DecRef(ctx) } // Are we done? @@ -108,7 +107,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. - upperInode.DecRef() + upperInode.DecRef(ctx) parent.copyMu.RUnlock() return nil, false, err } @@ -129,7 +128,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if err != nil && err != syserror.ENOENT { // Don't leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -152,7 +151,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } } } - child.DecRef() + child.DecRef(ctx) } } @@ -183,7 +182,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // unnecessary because we don't need to copy-up and we will always // operate (e.g. read/write) on the upper Inode. if !IsDir(upperInode.StableAttr) { - lowerInode.DecRef() + lowerInode.DecRef(ctx) lowerInode = nil } } @@ -194,10 +193,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Well, not quite, we failed at the last moment, how depressing. // Be sure not to leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } if lowerInode != nil { - lowerInode.DecRef() + lowerInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -248,7 +247,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // user) will clobber the real path for the underlying Inode. upperFile.Dirent.Inode.IncRef() upperDirent := NewTransientDirent(upperFile.Dirent.Inode) - upperFile.Dirent.DecRef() + upperFile.Dirent.DecRef(ctx) upperFile.Dirent = upperDirent // Create the overlay inode and dirent. We need this to construct the @@ -259,7 +258,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // The overlay file created below with NewFile will take a reference on // the overlayDirent, and it should be the only thing holding a // reference at the time of creation, so we must drop this reference. - defer overlayDirent.DecRef() + defer overlayDirent.DecRef(ctx) // Create a new overlay file that wraps the upper file. flags.Pread = upperFile.Flags().Pread @@ -399,7 +398,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { children, err := readdirOne(ctx, replaced) if err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } @@ -407,12 +406,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // included among the returned children, so we don't // need to bother checking for them. if len(children) > 0 { - replaced.DecRef() + replaced.DecRef(ctx) return syserror.ENOTEMPTY } } - replaced.DecRef() + replaced.DecRef(ctx) } } @@ -455,12 +454,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri // Grab the inode and drop the dirent, we don't need it. inode := d.Inode inode.IncRef() - d.DecRef() + d.DecRef(ctx) // Create a new overlay entry and dirent for the socket. entry, err := newOverlayEntry(ctx, inode, nil, false) if err != nil { - inode.DecRef() + inode.DecRef(ctx) return nil, err } // Use the parent's MountSource, since that corresponds to the overlay, @@ -539,7 +538,7 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin // Don't forward the value of the extended attribute if it would // unexpectedly change the behavior of a wrapping overlay layer. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return "", syserror.ENODATA } @@ -553,9 +552,9 @@ func overlayGetXattr(ctx context.Context, o *overlayEntry, name string, size uin return s, err } -func overlaySetxattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { +func overlaySetXattr(ctx context.Context, o *overlayEntry, d *Dirent, name, value string, flags uint32) error { // Don't allow changes to overlay xattrs through a setxattr syscall. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return syserror.EPERM } @@ -578,7 +577,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st for name := range names { // Same as overlayGetXattr, we shouldn't forward along // overlay attributes. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { delete(names, name) } } @@ -587,7 +586,7 @@ func overlayListXattr(ctx context.Context, o *overlayEntry, size uint64) (map[st func overlayRemoveXattr(ctx context.Context, o *overlayEntry, d *Dirent, name string) error { // Don't allow changes to overlay xattrs through a removexattr syscall. - if strings.HasPrefix(XattrOverlayPrefix, name) { + if isXattrOverlay(name) { return syserror.EPERM } @@ -672,7 +671,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { // ground and claim that jumping around the filesystem like this // is not supported. name, _ := dirent.FullName(nil) - dirent.DecRef() + dirent.DecRef(ctx) // Claim that the path is not accessible. err = syserror.EACCES diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 389c219d6..aa9851b26 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = &rootContext{ Context: ctx, @@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) { } // Drop the file reference. - file.DecRef() + file.DecRef(ctx) // Dirent should have 2 refs left. if got, want := dirent.ReadRefs(), 2; int(got) != want { @@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) { } // Drop our ref. - dirent.DecRef() + dirent.DecRef(ctx) // We should be back to zero refs. if got, want := dirent.ReadRefs(), 0; int(got) != want { @@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags if err != nil { return nil, err } - defer file.DecRef() + defer file.DecRef(ctx) // Wrap the file's FileOperations in a dirFile. fops := &dirFile{ FileOperations: file.FileOperations, diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index e3a715c1f..c5c07d564 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify { // Release implements FileOperations.Release. Release removes all watches and // frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.targetDestroyed from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all @@ -93,7 +93,7 @@ func (i *Inotify) Release() { // the owner's destructor. w.target.Watches.Remove(w.ID()) // Don't leak any references to the target, held by pins in the watch. - w.destroy() + w.destroy(ctx) } } @@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // // RmWatch looks up an inotify watch for the given 'wd' and configures the // target dirent to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) // Remove all pins. - watch.destroy() + watch.destroy(ctx) return nil } diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 900cba3ca..605423d22 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) { // Unpin drops any extra refs held on dirent due to a previous Pin // call. Calling Unpin multiple times for the same dirent, or on a dirent // without a corresponding Pin call is a no-op. -func (w *Watch) Unpin(d *Dirent) { +func (w *Watch) Unpin(ctx context.Context, d *Dirent) { w.mu.Lock() defer w.mu.Unlock() if w.pins[d] { delete(w.pins, d) - d.DecRef() + d.DecRef(ctx) } } @@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() { // this watch. Destroy does not cause any new events to be generated. The caller // is responsible for ensuring there are no outstanding references to this // watch. -func (w *Watch) destroy() { +func (w *Watch) destroy(ctx context.Context) { w.mu.Lock() defer w.mu.Unlock() for d := range w.pins { - d.DecRef() + d.DecRef(ctx) } w.pins = nil } diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 926538d90..8a5d9c7eb 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -62,7 +62,7 @@ import ( type LockType int // UniqueID is a unique identifier of the holder of a regional file lock. -type UniqueID uint64 +type UniqueID interface{} const ( // ReadLock describes a POSIX regional file lock to be taken @@ -98,12 +98,7 @@ type Lock struct { // If len(Readers) > 0 then HasWriter must be false. Readers map[UniqueID]bool - // HasWriter indicates that this is a write lock held by a single - // UniqueID. - HasWriter bool - - // Writer is only valid if HasWriter is true. It identifies a - // single write lock holder. + // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID } @@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock { case ReadLock: value.Readers[uid] = true case WriteLock: - value.HasWriter = true value.Writer = uid default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) @@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock { // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { - if l.HasWriter && l.Writer == uid { - return true - } - return l.Readers[uid] + return l.Writer == uid || l.Readers[uid] } // lock sets uid as a holder of a typed lock on Lock. @@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // We cannot downgrade a write lock to a read lock unless the // uid is the same. - if l.HasWriter { + if l.Writer != nil { if l.Writer != uid { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. l.Readers = make(map[UniqueID]bool) // Ensure that there is no longer a writer. - l.HasWriter = false + l.Writer = nil } l.Readers[uid] = true return case WriteLock: // If we are already the writer, then this is a no-op. - if l.HasWriter && l.Writer == uid { + if l.Writer == uid { return } // We can only upgrade a read lock to a write lock if there @@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // Ensure that there is only a writer. l.Readers = make(map[UniqueID]bool) - l.HasWriter = true l.Writer = uid default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) @@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { switch t { case ReadLock: return l.lockable(r, func(value Lock) bool { - // If there is no writer, there's no problem adding - // another reader. - if !value.HasWriter { + // If there is no writer, there's no problem adding another reader. + if value.Writer == nil { return true } // If there is a writer, then it must be the same uid @@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { case WriteLock: return l.lockable(r, func(value Lock) bool { // If there are only readers. - if !value.HasWriter { - // Then this uid can only take a write lock if - // this is a private upgrade, meaning that the - // only reader is uid. + if value.Writer == nil { + // Then this uid can only take a write lock if this is a private + // upgrade, meaning that the only reader is uid. return len(value.Readers) == 1 && value.Readers[uid] } // If the uid is already a writer on this region, then @@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { } } -// lock returns true if uid took a lock of type t on the entire range of LockRange. +// lock returns true if uid took a lock of type t on the entire range of +// LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { @@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { seg, _ = l.SplitUnchecked(seg, r.End) } - // Set the lock on the segment. This is guaranteed to + // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() value.lock(uid, t) @@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { value := seg.Value() var remove bool - if value.HasWriter && value.Writer == uid { + if value.Writer == uid { // If we are unlocking a writer, then since there can // only ever be one writer and no readers, then this // lock should always be removed from the set. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 8a3ace0c1..50a16e662 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) return Lock{}, false } } - if val1.HasWriter != val2.HasWriter { + if val1.Writer != val2.Writer { return Lock{}, false } - if val1.HasWriter { - if val1.Writer != val2.Writer { - return Lock{}, false - } - } return val1, true } @@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) for k, v := range val.Readers { val0.Readers[k] = v } - val0.HasWriter = val.HasWriter val0.Writer = val.Writer return val, val0 diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index ba002aeb7..fad90984b 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool { if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) { return false } - if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter { - return false - } if e0[i].Lock.Writer != e1[i].Lock.Writer { return false } @@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) { LockRange: LockRange{2048, 3072}, }, { - Lock: Lock{HasWriter: true, Writer: 1}, + Lock: Lock{Writer: 1}, LockRange: LockRange{3072, 4096}, }, }) @@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 4096}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, { @@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 3072}, }, { @@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - success := l.lock(test.uid, test.lockType, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } + r := LockRange{Start: test.start, End: test.end} + success := l.lock(test.uid, test.lockType, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } - if success != test.success { - t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success) - continue - } + if success != test.success { + t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success) + return + } - if success { - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + if success { + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } } - } + }) } } @@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) { // 0 4096 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, }, @@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) { // 0 8 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 8}, }, { @@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 8192 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - l.unlock(test.uid, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) - } + r := LockRange{Start: test.start, End: test.end} + l.unlock(test.uid, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } + }) } } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 37bae6810..ee69b10e8 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -51,7 +51,7 @@ type MountSourceOperations interface { DirentOperations // Destroy destroys the MountSource. - Destroy() + Destroy(ctx context.Context) // Below are MountSourceOperations that do not conform to Linux. @@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() { } } -func (msrc *MountSource) destroy() { +func (msrc *MountSource) destroy(ctx context.Context) { if c := msrc.DirentRefs(); c != 0 { panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) } - msrc.MountSourceOperations.Destroy() + msrc.MountSourceOperations.Destroy(ctx) } // DecRef drops a reference on the MountSource. -func (msrc *MountSource) DecRef() { - msrc.DecRefWithDestructor(msrc.destroy) +func (msrc *MountSource) DecRef(ctx context.Context) { + msrc.DecRefWithDestructor(ctx, msrc.destroy) } // FlushDirentRefs drops all references held by the MountSource on Dirents. @@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {} func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (*SimpleMountSourceOperations) Destroy() {} +func (*SimpleMountSourceOperations) Destroy(context.Context) {} // Info defines attributes of a filesystem. type Info struct { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 78e35b1e6..7badc75d6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin } // Destroy drops references on the upper and lower MountSource. -func (o *overlayMountSourceOperations) Destroy() { - o.upper.DecRef() - o.lower.DecRef() +func (o *overlayMountSourceOperations) Destroy(ctx context.Context) { + o.upper.DecRef(ctx) + o.lower.DecRef(ctx) } // type overlayFilesystem is the filesystem for overlay mounts. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a3d10770b..6c296f5d0 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,6 +18,7 @@ import ( "fmt" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" ) @@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } -func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { +func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { if groot := g.Root(); groot != nil { name, _ := groot.FullName(root) - groot.DecRef() + groot.DecRef(ctx) gotStr[i] = name gotPaths[name] = struct{}{} } @@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Get a child of the root which we will mount over. Note that the // MockInodeOperations causes Walk to always succeed. @@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } // mm root should contain all submounts (and does not include the root mount). rootMnt := mm.FindMount(rootDirent) submounts := mm.AllMountsUnder(rootMnt) allPaths := append(paths, "/") - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } @@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - defer d.DecRef() + defer d.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(d)) - if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } @@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - defer waldo.DecRef() + defer waldo.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(waldo)) - if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil { t.Error(err) } } @@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } allPaths := make([]string, len(paths)+1) @@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) { if err := mm.Unmount(ctx, d, false); err != nil { t.Fatalf("could not unmount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) // Remove the path that has been unmounted and the check that the remaining // mounts are still there. allPaths = allPaths[:len(allPaths)-1] submounts := mm.AllMountsUnder(rootMnt) - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 3f2bd0e87..d741c4339 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -234,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() { // After destroy is called, the MountNamespace may continue to be referenced (for // example via /proc/mounts), but should free all resources and shouldn't have // Find* methods called. -func (mns *MountNamespace) destroy() { +func (mns *MountNamespace) destroy(ctx context.Context) { mns.mu.Lock() defer mns.mu.Unlock() @@ -247,13 +247,13 @@ func (mns *MountNamespace) destroy() { for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. for ; mp != nil; mp = mp.previous { - mp.root.DecRef() + mp.root.DecRef(ctx) } } mns.mounts = nil // Drop reference on the root. - mns.root.DecRef() + mns.root.DecRef(ctx) // Ensure that root cannot be accessed via this MountNamespace any // more. @@ -265,8 +265,8 @@ func (mns *MountNamespace) destroy() { } // DecRef implements RefCounter.DecRef with destructor mns.destroy. -func (mns *MountNamespace) DecRef() { - mns.DecRefWithDestructor(mns.destroy) +func (mns *MountNamespace) DecRef(ctx context.Context) { + mns.DecRefWithDestructor(ctx, mns.destroy) } // withMountLocked prevents further walks to `node`, because `node` is about to @@ -312,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode if err != nil { return err } - defer replacement.DecRef() + defer replacement.DecRef(ctx) // Set the mount's root dirent and id. parentMnt := mns.findMountLocked(mountPoint) @@ -394,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } // Drop mount reference taken at the end of MountNamespace.Mount. - prev.root.DecRef() + prev.root.DecRef(ctx) } else { mns.mounts[prev.root] = prev } @@ -496,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // non-directory root is hopeless. if current != root { if !IsDir(current.Inode.StableAttr) { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, syserror.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, err } } @@ -511,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // Allow failed walks to cache the dirent, because no // children will acquire a reference at the end. current.maybeExtendReference() - current.DecRef() + current.DecRef(ctx) return nil, err } // Drop old reference. - current.DecRef() + current.DecRef(ctx) if remainder != "" { // Ensure it's resolved, unless it's the last level. @@ -570,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema case nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { - target.DecRef() + target.DecRef(ctx) return nil, syscall.ELOOP } - node.DecRef() // Drop the original reference. + node.DecRef(ctx) // Drop the original reference. return target, nil case syscall.ENOLINK: @@ -582,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return node, nil case ErrResolveViaReadlink: - defer node.DecRef() // See above. + defer node.DecRef(ctx) // See above. // First, check if we should traverse. if *remainingTraversals == 0 { @@ -608,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return d, err default: - node.DecRef() // Drop for err; see above. + node.DecRef(ctx) // Drop for err; see above. // Propagate the error. return nil, err diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index a69b41468..975d6cbc9 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) { } root := mm.Root() - defer root.DecRef() + defer root.DecRef(ctx) foo, err := root.Walk(ctx, root, "foo") if err != nil { t.Fatalf("Error walking to foo: %v", err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index a8ae7d81d..01a1235b8 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -86,13 +86,12 @@ func isXattrOverlay(name string) bool { // NewOverlayRoot produces the root of an overlay. // // Preconditions: -// -// - upper and lower must be non-nil. -// - upper must not be an overlay. -// - lower should not expose character devices, pipes, or sockets, because +// * upper and lower must be non-nil. +// * upper must not be an overlay. +// * lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. -// - lower must not require that file objects be revalidated. -// - lower must not have dynamic file/directory content. +// * lower must not require that file objects be revalidated. +// * lower must not have dynamic file/directory content. func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsDir(upper.StableAttr) { return nil, fmt.Errorf("upper Inode is a %v, not a directory", upper.StableAttr.Type) @@ -107,7 +106,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } @@ -117,12 +116,11 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount // NewOverlayRootFile produces the root of an overlay that points to a file. // // Preconditions: -// -// - lower must be non-nil. -// - lower should not expose character devices, pipes, or sockets, because +// * lower must be non-nil. +// * lower should not expose character devices, pipes, or sockets, because // copying up these types of files is not supported. Neither it can be a dir. -// - lower must not require that file objects be revalidated. -// - lower must not have dynamic file/directory content. +// * lower must not require that file objects be revalidated. +// * lower must not have dynamic file/directory content. func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, flags MountSourceFlags) (*Inode, error) { if !IsRegular(lower.StableAttr) { return nil, fmt.Errorf("lower Inode is not a regular file") @@ -130,7 +128,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, nil, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } return newOverlayInode(ctx, overlay, msrc), nil @@ -230,16 +228,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist }, nil } -func (o *overlayEntry) release() { +func (o *overlayEntry) release(ctx context.Context) { // We drop a reference on upper and lower file system Inodes // rather than releasing them, because in-memory filesystems // may hold an extra reference to these Inodes so that they // stay in memory. if o.upper != nil { - o.upper.DecRef() + o.upper.DecRef(ctx) } if o.lower != nil { - o.lower.DecRef() + o.lower.DecRef(ctx) } } diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 77c2c5c0e..b8b2281a8 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 35972e23c..45523adf8 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { +func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds []int32 t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } n, _ := f.file.Dirent.FullName(root) return n, nil @@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { func (f *fd) Release(ctx context.Context) { f.Symlink.Release(ctx) - f.file.DecRef() -} - -// Close releases the reference on the file. -func (f *fd) Close() error { - f.file.DecRef() - return nil + f.file.DecRef(ctx) } // fdDir is an InodeOperations for /proc/TID/fd. @@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer if f.isInfoFile { typ = fs.Symlink } - return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { return fs.GenericDentAttr(typ, device.ProcDevice) }) } @@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs // locks, and other data. For now we only have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - file.DecRef() + file.DecRef(ctx) contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) return newStaticProcInode(ctx, dir.MountSource, contents) }) diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1fc9c703c..6a63c47b3 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { // The task has been destroyed. Nothing to show here. return } - defer rootDir.DecRef() + defer rootDir.DecRef(t) mnt := t.MountNamespace().FindMount(rootDir) if mnt == nil { @@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { continue // No longer valid. } mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() + mroot.DecRef(t) if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue @@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if mroot == nil { return // No longer valid. } - defer mroot.DecRef() + defer mroot.DecRef(ctx) // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue @@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if root == nil { return // No longer valid. } - defer root.DecRef() + defer root.DecRef(ctx) flags := root.Inode.MountSource.Flags opts := "rw" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bd18177d4..83a43aa26 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c659224a7..77e0e1d26 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // Add dot and dotdot. root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) names = append(names, ".", "..") diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 702fdd392..e555672ad 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -54,7 +55,7 @@ type tcpMemInode struct { // size stores the tcp buffer size during save, and sets the buffer // size in netstack in restore. We must save/restore this here, since - // netstack itself is stateless. + // a netstack instance is created on restore. size inet.TCPBufferSize // mu protects against concurrent reads/writes to files based on this @@ -258,6 +259,9 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque if src.NumBytes() == 0 { return 0, nil } + + // Only consider size of one memory page for input for performance reasons. + // We are only reading if it's zero or not anyway. src = src.TakeFirst(usermem.PageSize - 1) var v int32 @@ -272,6 +276,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled) } +// +stateify savable +type tcpRecovery struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + recovery inet.TCPLossRecovery +} + +func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ts := &tcpRecovery{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ts, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{ + tcpRecovery: r, + stack: r.stack, + }), nil +} + +// +stateify savable +type tcpRecoveryFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + tcpRecovery *tcpRecovery + + stack inet.Stack `state:"wait"` +} + +// Read implements fs.FileOperations.Read. +func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + recovery, err := f.stack.TCPRecovery() + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = recovery + s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery) + n, err := dst.CopyOut(ctx, []byte(s)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = inet.TCPLossRecovery(v) + if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil { + return 0, err + } + return n, nil +} + func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -293,11 +387,125 @@ func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.S return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } +// ipForwarding implements fs.InodeOperations. +// +// ipForwarding is used to enable/disable packet forwarding of netstack. +// +// +stateify savable +type ipForwarding struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + + // enabled stores the IPv4 forwarding state on save. + // We must save/restore this here, since a netstack instance + // is created on restore. + enabled *bool +} + +func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ipf := &ipForwarding{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ipf, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*ipForwarding) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// +stateify savable +type ipForwardingFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + ipf *ipForwarding + + stack inet.Stack `state:"wait"` +} + +// GetFile implements fs.InodeOperations.GetFile. +func (ipf *ipForwarding) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &ipForwardingFile{ + stack: ipf.stack, + ipf: ipf, + }), nil +} + +// Read implements fs.FileOperations.Read. +func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + if f.ipf.enabled == nil { + enabled := f.stack.Forwarding(ipv4.ProtocolNumber) + f.ipf.enabled = &enabled + } + + val := "0\n" + if *f.ipf.enabled { + // Technically, this is not quite compatible with Linux. Linux + // stores these as an integer, so if you write "2" into + // ip_forward, you should get 2 back. + val = "1\n" + } + n, err := dst.CopyOut(ctx, []byte(val)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +// +// Offset is ignored, multiple writes are not supported. +func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Only consider size of one memory page for input for performance reasons. + // We are only reading if it's zero or not anyway. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + if f.ipf.enabled == nil { + f.ipf.enabled = new(bool) + } + *f.ipf.enabled = v != 0 + return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled) +} + func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { contents := map[string]*fs.Inode{ // Add tcp_sack. "tcp_sack": newTCPSackInode(ctx, msrc, s), + // Add ip_forward. + "ip_forward": newIPForwardingInode(ctx, msrc, s), + // The following files are simple stubs until they are // implemented in netstack, most of these files are // configuration related. We use the value closest to the @@ -351,6 +559,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem) } + // Add tcp_recovery. + if _, err := s.TCPRecovery(); err == nil { + contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s) + } + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 6eba709c6..4cb4741af 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -14,7 +14,11 @@ package proc -import "fmt" +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" +) // beforeSave is invoked by stateify. func (t *tcpMemInode) beforeSave() { @@ -40,3 +44,12 @@ func (s *tcpSack) afterLoad() { } } } + +// afterLoad is invoked by stateify. +func (ipf *ipForwarding) afterLoad() { + if ipf.enabled != nil { + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { + panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err)) + } + } +} diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 355e83d47..6ef5738e7 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -123,3 +123,76 @@ func TestConfigureRecvBufferSize(t *testing.T) { } } } + +// TestIPForwarding tests the implementation of +// /proc/sys/net/ipv4/ip_forwarding +func TestIPForwarding(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + + var cases = []struct { + comment string + initial bool + str string + final bool + }{ + { + comment: `Forwarding is disabled; write 1 and enable forwarding`, + initial: false, + str: "1", + final: true, + }, + { + comment: `Forwarding is disabled; write 0 and disable forwarding`, + initial: false, + str: "0", + final: false, + }, + { + comment: `Forwarding is enabled; write 1 and enable forwarding`, + initial: true, + str: "1", + final: true, + }, + { + comment: `Forwarding is enabled; write 0 and disable forwarding`, + initial: true, + str: "0", + final: false, + }, + { + comment: `Forwarding is disabled; write 2404 and enable forwarding`, + initial: false, + str: "2404", + final: true, + }, + { + comment: `Forwarding is enabled; write 2404 and enable forwarding`, + initial: true, + str: "2404", + final: true, + }, + } + for _, c := range cases { + t.Run(c.comment, func(t *testing.T) { + s.IPForwarding = c.initial + ipf := &ipForwarding{stack: s} + file := &ipForwardingFile{ + stack: s, + ipf: ipf, + } + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := file.Write(ctx, nil, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if got, want := s.IPForwarding, c.final; got != want { + t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want) + } + + }) + } +} diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4bbe90198..450044c9c 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo "auxv": newAuxvec(t, msrc), "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), "comm": newComm(t, msrc), + "cwd": newCwd(t, msrc), "environ": newExecArgInode(t, msrc, environExecArg), "exe": newExe(t, msrc), "fd": newFdDir(t, msrc), @@ -91,6 +92,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo "gid_map": newGIDMap(t, msrc), "io": newIO(t, msrc, isThreadGroup), "maps": newMaps(t, msrc), + "mem": newMem(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), "net": newNetDir(t, msrc), @@ -185,7 +187,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // Serialize "." and "..". root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) if err := dirCtx.DirEmit(".", dot); err != nil { @@ -295,11 +297,54 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } +// cwd is an fs.InodeOperations symlink for the /proc/PID/cwd file. +// +// +stateify savable +type cwd struct { + ramfs.Symlink + + t *kernel.Task +} + +func newCwd(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + cwdSymlink := &cwd{ + Symlink: *ramfs.NewSymlink(t, fs.RootOwner, ""), + t: t, + } + return newProcInode(t, cwdSymlink, msrc, fs.Symlink, t) +} + +// Readlink implements fs.InodeOperations. +func (e *cwd) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !kernel.ContextCanTrace(ctx, e.t, false) { + return "", syserror.EACCES + } + if err := checkTaskState(e.t); err != nil { + return "", err + } + cwd := e.t.FSContext().WorkingDirectory() + if cwd == nil { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer cwd.DecRef(ctx) + + root := fs.RootFromContext(ctx) + if root == nil { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) + + name, _ := cwd.FullName(root) + return name, nil +} + // namespaceSymlink represents a symlink in the namespacefs, such as the files // in /proc/<pid>/ns. // @@ -355,6 +400,88 @@ func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(t, d, msrc, fs.SpecialDirectory, t) } +// memData implements fs.Inode for /proc/[pid]/mem. +// +// +stateify savable +type memData struct { + fsutil.SimpleFileInode + + t *kernel.Task +} + +// memDataFile implements fs.FileOperations for /proc/[pid]/mem. +// +// +stateify savable +type memDataFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoWrite `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + t *kernel.Task +} + +func newMem(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + inode := &memData{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0400), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(t, inode, msrc, fs.SpecialFile, t) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (m *memData) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (m *memData) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS + // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS + // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH + if !kernel.ContextCanTrace(ctx, m.t, true) { + return nil, syserror.EACCES + } + if err := checkTaskState(m.t); err != nil { + return nil, err + } + // Enable random access reads + flags.Pread = true + return fs.NewFile(ctx, dirent, flags, &memDataFile{t: m.t}), nil +} + +// Read implements fs.FileOperations.Read. +func (m *memDataFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + mm, err := getTaskMM(m.t) + if err != nil { + return 0, nil + } + defer mm.DecUsers(ctx) + // Buffer the read data because of MM locks + buf := make([]byte, dst.NumBytes()) + n, readErr := mm.CopyIn(ctx, usermem.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) + if n > 0 { + if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { + return 0, syserror.EFAULT + } + return int64(n), nil + } + if readErr != nil { + return 0, syserror.EIO + } + return 0, nil +} + // mapsData implements seqfile.SeqSource for /proc/[pid]/maps. // // +stateify savable @@ -604,7 +731,7 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( var vss, rss, data uint64 s.t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + fds = fdTable.CurrentMaxFDs() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bfa304552..f4fcddecb 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // Create the Dirent and corresponding file. created := fs.NewDirent(ctx, inode, name) - defer created.DecRef() + defer created.DecRef(ctx) return created.Inode.GetFile(ctx, created, flags) } @@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol } // Release implements fs.InodeOperation.Release. -func (d *Dir) Release(_ context.Context) { +func (d *Dir) Release(ctx context.Context) { // Drop references on all children. d.mu.Lock() for _, i := range d.children { - i.DecRef() + i.DecRef(ctx) } d.mu.Unlock() } @@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, @@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { // dropped when that dirent is destroyed. inode.IncRef() d := fs.NewTransientDirent(inode) - defer d.DecRef() + defer d.DecRef(ctx) file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { return false, err } - defer file.DecRef() + defer file.DecRef(ctx) ser := &fs.CollectEntriesSerializer{} if err := file.Readdir(ctx, ser); err != nil { @@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if err != nil { return err } - inode.DecRef() + inode.DecRef(ctx) } // Be careful, we may have already grabbed this mutex above. diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index a6ed8b2c5..3e0d1e07e 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) { continue } root := mm.Root() - defer mm.DecRef() + defer mm.DecRef(ctx) for _, p := range test.subdirs { maxTraversals := uint(0) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 88c344089..f362ca9b6 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -55,7 +55,7 @@ type TimerOperations struct { func NewFile(ctx context.Context, c ktime.Clock) *fs.File { dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) tops := &TimerOperations{} tops.timer = ktime.NewTimer(c, tops) // Timerfds reject writes, but the Write flag must be set in order to @@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File { } // Release implements fs.FileOperations.Release. -func (t *TimerOperations) Release() { +func (t *TimerOperations) Release(context.Context) { t.timer.Destroy() } diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index aaba35502..d4d613ea9 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File { func TestGrow(t *testing.T) { ctx := contexttest.Context(t) f := newFile(ctx) - defer f.DecRef() + defer f.DecRef(ctx) abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 1dc75291d..fc0498f17 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -613,7 +613,7 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional } mf := f.kernel.MemoryFile() - cerr := f.data.Fill(ctx, required, optional, mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index b095312fe..998b697ca 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -16,6 +16,8 @@ package tmpfs import ( + "math" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -32,9 +34,15 @@ import ( var fsInfo = fs.Info{ Type: linux.TMPFS_MAGIC, + // tmpfs currently does not support configurable size limits. In Linux, + // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from + // statfs(2). However, many applications treat this as having a size limit + // of 0. To work around this, claim to have a very large but non-zero size, + // chosen to ensure that BlockSize * Blocks does not overflow int64 (which + // applications may also handle incorrectly). // TODO(b/29637826): allow configuring a tmpfs size and enforce it. - TotalBlocks: 0, - FreeBlocks: 0, + TotalBlocks: math.MaxInt64 / usermem.PageSize, + FreeBlocks: math.MaxInt64 / usermem.PageSize, } // rename implements fs.InodeOperations.Rename for tmpfs nodes. diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 5cb0e0417..e6d0eb359 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -10,13 +10,14 @@ go_library( "line_discipline.go", "master.go", "queue.go", - "slave.go", + "replica.go", "terminal.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/marshal/primitive", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 108654827..c2da80bc2 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -37,14 +37,14 @@ import ( // This indirectly manages all terminals within the mount. // // New Terminals are created by masterInodeOperations.GetFile, which registers -// the slave Inode in the this directory for discovery via Lookup/Readdir. The -// slave inode is unregistered when the master file is Released, as the slave +// the replica Inode in the this directory for discovery via Lookup/Readdir. The +// replica inode is unregistered when the master file is Released, as the replica // is no longer discoverable at that point. // // References on the underlying Terminal are held by masterFileOperations and -// slaveInodeOperations. +// replicaInodeOperations. // -// masterInodeOperations and slaveInodeOperations hold a pointer to +// masterInodeOperations and replicaInodeOperations hold a pointer to // dirInodeOperations, which is reference counted by the refcount their // corresponding Dirents hold on their parent (this directory). // @@ -76,16 +76,16 @@ type dirInodeOperations struct { // master is the master PTY inode. master *fs.Inode - // slaves contains the slave inodes reachable from the directory. + // replicas contains the replica inodes reachable from the directory. // - // A new slave is added by allocateTerminal and is removed by + // A new replica is added by allocateTerminal and is removed by // masterFileOperations.Release. // - // A reference is held on every slave in the map. - slaves map[uint32]*fs.Inode + // A reference is held on every replica in the map. + replicas map[uint32]*fs.Inode // dentryMap is a SortedDentryMap used to implement Readdir containing - // the master and all entries in slaves. + // the master and all entries in replicas. dentryMap *fs.SortedDentryMap // next is the next pty index to use. @@ -101,7 +101,7 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { d := &dirInodeOperations{ InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.RootOwner, fs.FilePermsFromMode(0555), linux.DEVPTS_SUPER_MAGIC), msrc: m, - slaves: make(map[uint32]*fs.Inode), + replicas: make(map[uint32]*fs.Inode), dentryMap: fs.NewSortedDentryMap(nil), } // Linux devpts uses a default mode of 0000 for ptmx which can be @@ -132,8 +132,8 @@ func (d *dirInodeOperations) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() - d.master.DecRef() - if len(d.slaves) != 0 { + d.master.DecRef(ctx) + if len(d.replicas) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) } } @@ -149,14 +149,14 @@ func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name str return fs.NewDirent(ctx, d.master, name), nil } - // Slave number? + // Replica number? n, err := strconv.ParseUint(name, 10, 32) if err != nil { // Not found. return nil, syserror.ENOENT } - s, ok := d.slaves[uint32(n)] + s, ok := d.replicas[uint32(n)] if !ok { return nil, syserror.ENOENT } @@ -236,7 +236,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e return nil, syserror.ENOMEM } - if _, ok := d.slaves[n]; ok { + if _, ok := d.replicas[n]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", n)) } @@ -244,41 +244,41 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e d.next++ // The reference returned by newTerminal is returned to the caller. - // Take another for the slave inode. + // Take another for the replica inode. t.IncRef() // Create a pts node. The owner is based on the context that opens // ptmx. creds := auth.CredentialsFromContext(ctx) uid, gid := creds.EffectiveKUID, creds.EffectiveKGID - slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) + replica := newReplicaInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) - d.slaves[n] = slave + d.replicas[n] = replica d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{ - Type: slave.StableAttr.Type, - InodeID: slave.StableAttr.InodeID, + Type: replica.StableAttr.Type, + InodeID: replica.StableAttr.InodeID, }) return t, nil } // masterClose is called when the master end of t is closed. -func (d *dirInodeOperations) masterClose(t *Terminal) { +func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) { d.mu.Lock() defer d.mu.Unlock() - // The slave end disappears from the directory when the master end is - // closed, even if the slave end is open elsewhere. + // The replica end disappears from the directory when the master end is + // closed, even if the replica end is open elsewhere. // // N.B. since we're using a backdoor method to remove a directory entry // we won't properly fire inotify events like Linux would. - s, ok := d.slaves[t.n] + s, ok := d.replicas[t.n] if !ok { panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) } - s.DecRef() - delete(d.slaves, t.n) + s.DecRef(ctx) + delete(d.replicas, t.n) d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) } @@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 8fe05ebe5..13f4901db 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -79,8 +79,8 @@ type superOperations struct{} // // It always returns true, forcing a Lookup for all entries. // -// Slave entries are dropped from dir when their master is closed, so an -// existing slave Dirent in the tree is not sufficient to guarantee that it +// Replica entries are dropped from dir when their master is closed, so an +// existing replica Dirent in the tree is not sufficient to guarantee that it // still exists on the filesystem. func (superOperations) Revalidate(context.Context, string, *fs.Inode, *fs.Inode) bool { return true @@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {} func (superOperations) SaveInodeMapping(*fs.Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (superOperations) Destroy() {} +func (superOperations) Destroy(context.Context) {} diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 2e9dd2d55..b34f4a0eb 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -43,7 +44,7 @@ const ( ) // lineDiscipline dictates how input and output are handled between the -// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // @@ -54,8 +55,8 @@ const ( // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the -// terminal master to slave (the input queue) and one from slave to master (the -// output queue). When bytes are written to one end of the pty, the line +// terminal master to replica (the input queue) and one from replica to master +// (the output queue). When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // @@ -64,7 +65,7 @@ const ( // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v -// masterFD slaveFD +// masterFD replicaFD // ^ | // | | // | output to terminal +--------------+ output from process | @@ -103,8 +104,8 @@ type lineDiscipline struct { // masterWaiter is used to wait on the master end of the TTY. masterWaiter waiter.Queue `state:"zerovalue"` - // slaveWaiter is used to wait on the slave end of the TTY. - slaveWaiter waiter.Queue `state:"zerovalue"` + // replicaWaiter is used to wait on the replica end of the TTY. + replicaWaiter waiter.Queue `state:"zerovalue"` } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -115,27 +116,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { } // getTermios gets the linux.Termios for the tty. -func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. -func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait @@ -146,27 +143,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return 0, err } -func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyOut(t, args[2].Pointer()) return err } -func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyIn(t, args[2].Pointer()) return err } @@ -176,14 +169,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } -func (l *lineDiscipline) slaveReadiness() waiter.EventMask { +func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.inQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { + return l.inQueue.readableSize(t, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -196,7 +189,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque if n > 0 { l.masterWaiter.Notify(waiter.EventOut) if pushed { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return n, nil } @@ -211,14 +204,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) return n, nil } return 0, syserror.ErrWouldBlock } -func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.outQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, args arch.SyscallArguments) error { + return l.outQueue.readableSize(t, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -229,7 +222,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventOut) + l.replicaWaiter.Notify(waiter.EventOut) if pushed { l.masterWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index fe07fa929..b91184b1b 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -17,9 +17,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -75,7 +77,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn } // Release implements fs.InodeOperations.Release. -func (mi *masterInodeOperations) Release(ctx context.Context) { +func (mi *masterInodeOperations) Release(context.Context) { } // Truncate implements fs.InodeOperations.Truncate. @@ -120,9 +122,9 @@ type masterFileOperations struct { var _ fs.FileOperations = (*masterFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (mf *masterFileOperations) Release() { - mf.d.masterClose(mf.t) - mf.t.DecRef() +func (mf *masterFileOperations) Release(ctx context.Context) { + mf.d.masterClose(ctx, mf.t) + mf.t.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. @@ -152,46 +154,51 @@ func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src userm // Ioctl implements fs.FileOperations.Ioctl. func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. - return 0, mf.t.ld.outputQueueReadSize(ctx, io, args) + return 0, mf.t.ld.outputQueueReadSize(t, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration - // of the slave end. - return mf.t.ld.getTermios(ctx, io, args) + // of the replica end. + return mf.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration - // of the slave end. - return mf.t.ld.setTermios(ctx, io, args) + // of the replica end. + return mf.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return mf.t.ld.setTermios(ctx, io, args) + return mf.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(mf.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: - return 0, mf.t.ld.windowSize(ctx, io, args) + return 0, mf.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, mf.t.ld.setWindowSize(ctx, io, args) + return 0, mf.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mf.t.setControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mf.t.releaseControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + return mf.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + return mf.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index ceabb9b1e..79975d812 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -17,8 +17,10 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -32,7 +34,7 @@ import ( const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is +// replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and @@ -85,17 +87,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { } // readableSize writes the number of readable bytes to userspace. -func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (q *queue) readableSize(t *kernel.Task, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() - var size int32 + size := primitive.Int32(0) if q.readable { - size = int32(len(q.readBuf)) + size = primitive.Int32(len(q.readBuf)) } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := size.CopyOut(t, args[2].Pointer()) return err } @@ -104,8 +104,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // as whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -145,8 +144,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl // write writes to q from userspace. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { q.mu.Lock() defer q.mu.Unlock() @@ -188,8 +186,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip // writeBytes writes to q from b. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.mu.Lock() defer q.mu.Unlock() diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/replica.go index 9871f6fc6..385d230fb 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/replica.go @@ -17,9 +17,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -27,11 +29,11 @@ import ( // LINT.IfChange -// slaveInodeOperations are the fs.InodeOperations for the slave end of the +// replicaInodeOperations are the fs.InodeOperations for the replica end of the // Terminal (pts file). // // +stateify savable -type slaveInodeOperations struct { +type replicaInodeOperations struct { fsutil.SimpleFileInode // d is the containing dir. @@ -41,13 +43,13 @@ type slaveInodeOperations struct { t *Terminal } -var _ fs.InodeOperations = (*slaveInodeOperations)(nil) +var _ fs.InodeOperations = (*replicaInodeOperations)(nil) -// newSlaveInode creates an fs.Inode for the slave end of a terminal. +// newReplicaInode creates an fs.Inode for the replica end of a terminal. // -// newSlaveInode takes ownership of t. -func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { - iops := &slaveInodeOperations{ +// newReplicaInode takes ownership of t. +func newReplicaInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &replicaInodeOperations{ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, owner, p, linux.DEVPTS_SUPER_MAGIC), d: d, t: t, @@ -64,18 +66,18 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne Type: fs.CharacterDevice, // See fs/devpts/inode.c:devpts_fill_super. BlockSize: 1024, - DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR, + DeviceFileMajor: linux.UNIX98_PTY_REPLICA_MAJOR, DeviceFileMinor: t.n, }) } // Release implements fs.InodeOperations.Release. -func (si *slaveInodeOperations) Release(ctx context.Context) { - si.t.DecRef() +func (si *replicaInodeOperations) Release(ctx context.Context) { + si.t.DecRef(ctx) } // Truncate implements fs.InodeOperations.Truncate. -func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { +func (*replicaInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { return nil } @@ -83,14 +85,15 @@ func (*slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error { // // This may race with destruction of the terminal. If the terminal is gone, it // returns ENOENT. -func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { - return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil +func (si *replicaInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &replicaFileOperations{si: si}), nil } -// slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +// replicaFileOperations are the fs.FileOperations for the replica end of a +// terminal. // // +stateify savable -type slaveFileOperations struct { +type replicaFileOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` @@ -100,79 +103,84 @@ type slaveFileOperations struct { fsutil.FileUseInodeUnstableAttr `state:"nosave"` // si is the inode operations. - si *slaveInodeOperations + si *replicaInodeOperations } -var _ fs.FileOperations = (*slaveFileOperations)(nil) +var _ fs.FileOperations = (*replicaFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (sf *slaveFileOperations) Release() { +func (sf *replicaFileOperations) Release(context.Context) { } // EventRegister implements waiter.Waitable.EventRegister. -func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sf.si.t.ld.slaveWaiter.EventRegister(e, mask) +func (sf *replicaFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sf.si.t.ld.replicaWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) { - sf.si.t.ld.slaveWaiter.EventUnregister(e) +func (sf *replicaFileOperations) EventUnregister(e *waiter.Entry) { + sf.si.t.ld.replicaWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. -func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { - return sf.si.t.ld.slaveReadiness() +func (sf *replicaFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return sf.si.t.ld.replicaReadiness() } // Read implements fs.FileOperations.Read. -func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { +func (sf *replicaFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { return sf.si.t.ld.inputQueueRead(ctx, dst) } // Write implements fs.FileOperations.Write. -func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { +func (sf *replicaFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { return sf.si.t.ld.outputQueueWrite(ctx, src) } // Ioctl implements fs.FileOperations.Ioctl. -func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (sf *replicaFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. - return 0, sf.si.t.ld.inputQueueReadSize(ctx, io, args) + return 0, sf.si.t.ld.inputQueueReadSize(t, args) case linux.TCGETS: - return sf.si.t.ld.getTermios(ctx, io, args) + return sf.si.t.ld.getTermios(t, args) case linux.TCSETS: - return sf.si.t.ld.setTermios(ctx, io, args) + return sf.si.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return sf.si.t.ld.setTermios(ctx, io, args) + return sf.si.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(sf.si.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCGWINSZ: - return 0, sf.si.t.ld.windowSize(ctx, io, args) + return 0, sf.si.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, sf.si.t.ld.setWindowSize(ctx, io, args) + return 0, sf.si.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */) + return 0, sf.si.t.setControllingTTY(ctx, args, false /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + return 0, sf.si.t.releaseControllingTTY(ctx, args, false /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + return sf.si.t.foregroundProcessGroup(ctx, args, false /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) + return sf.si.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY } } -// LINT.ThenChange(../../fsimpl/devpts/slave.go) +// LINT.ThenChange(../../fsimpl/devpts/replica.go) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index ddcccf4da..4f431d74d 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -17,10 +17,10 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange @@ -44,19 +44,19 @@ type Terminal struct { // this terminal. This field is immutable. masterKTTY *kernel.TTY - // slaveKTTY contains the controlling process of the slave end of this + // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. - slaveKTTY *kernel.TTY + replicaKTTY *kernel.TTY } func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { - termios := linux.DefaultSlaveTermios + termios := linux.DefaultReplicaTermios t := Terminal{ - d: d, - n: n, - ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{Index: n}, - slaveKTTY: &kernel.TTY{Index: n}, + d: d, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + replicaKTTY: &kernel.TTY{Index: n}, } t.EnableLeakCheck("tty.Terminal") return &t @@ -64,7 +64,7 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal // setControllingTTY makes tm the controlling terminal of the calling thread // group. -func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("setControllingTTY must be called from a task context") @@ -75,7 +75,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a // releaseControllingTTY removes tm as the controlling terminal of the calling // thread group. -func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("releaseControllingTTY must be called from a task context") @@ -85,7 +85,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar } // foregroundProcessGroup gets the process group ID of tm's foreground process. -func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("foregroundProcessGroup must be called from a task context") @@ -97,24 +97,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a } // Write it out to *arg. - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ - AddressSpaceActive: true, - }) + retP := primitive.Int32(ret) + _, err = retP.CopyOut(task, args[2].Pointer()) return 0, err } // foregroundProcessGroup sets tm's foreground process. -func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("setForegroundProcessGroup must be called from a task context") } // Read in the process group ID. - var pgid int32 - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgid primitive.Int32 + if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } @@ -126,7 +123,7 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } - return tm.slaveKTTY + return tm.replicaKTTY } // LINT.ThenChange(../../fsimpl/devpts/terminal.go) diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 2cbc05678..49edee83d 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func TestSimpleMasterToSlave(t *testing.T) { - ld := newLineDiscipline(linux.DefaultSlaveTermios) +func TestSimpleMasterToReplica(t *testing.T) { + ld := newLineDiscipline(linux.DefaultReplicaTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) diff --git a/pkg/sentry/fs/user/BUILD b/pkg/sentry/fs/user/BUILD index bd5dac373..66e949c95 100644 --- a/pkg/sentry/fs/user/BUILD +++ b/pkg/sentry/fs/user/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/sentry/fs", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index fbd4547a7..124bc95ed 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -31,7 +32,15 @@ import ( // ResolveExecutablePath resolves the given executable name given the working // dir and environment. -func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs.MountNamespace, envv []string, wd, name string) (string, error) { +func ResolveExecutablePath(ctx context.Context, args *kernel.CreateProcessArgs) (string, error) { + name := args.Filename + if len(name) == 0 { + if len(args.Argv) == 0 { + return "", fmt.Errorf("no filename or command provided") + } + name = args.Argv[0] + } + // Absolute paths can be used directly. if path.IsAbs(name) { return name, nil @@ -40,6 +49,7 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs // Paths with '/' in them should be joined to the working directory, or // to the root if working directory is not set. if strings.IndexByte(name, '/') > 0 { + wd := args.WorkingDirectory if wd == "" { wd = "/" } @@ -49,16 +59,30 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs return path.Join(wd, name), nil } - // Otherwise, We must lookup the name in the paths, starting from the - // calling context's root directory. - paths := getPath(envv) + // Otherwise, We must lookup the name in the paths. + paths := getPath(args.Envv) + if kernel.VFS2Enabled { + f, err := resolveVFS2(ctx, args.Credentials, args.MountNamespaceVFS2, paths, name) + if err != nil { + return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err) + } + return f, nil + } + + f, err := resolve(ctx, args.MountNamespace, paths, name) + if err != nil { + return "", fmt.Errorf("error finding executable %q in PATH %v: %v", name, paths, err) + } + return f, nil +} +func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name string) (string, error) { root := fs.RootFromContext(ctx) if root == nil { // Caller has no root. Don't bother traversing anything. return "", syserror.ENOENT } - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -76,7 +100,7 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs if err != nil { return "", err } - defer d.DecRef() + defer d.DecRef(ctx) // Check that it is a regular file. if !fs.IsRegular(d.Inode.StableAttr) { @@ -95,32 +119,10 @@ func ResolveExecutablePath(ctx context.Context, creds *auth.Credentials, mns *fs return "", syserror.ENOENT } -// ResolveExecutablePathVFS2 resolves the given executable name given the -// working dir and environment. -func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, envv []string, wd, name string) (string, error) { - // Absolute paths can be used directly. - if path.IsAbs(name) { - return name, nil - } - - // Paths with '/' in them should be joined to the working directory, or - // to the root if working directory is not set. - if strings.IndexByte(name, '/') > 0 { - if wd == "" { - wd = "/" - } - if !path.IsAbs(wd) { - return "", fmt.Errorf("working directory %q must be absolute", wd) - } - return path.Join(wd, name), nil - } - - // Otherwise, We must lookup the name in the paths, starting from the - // calling context's root directory. - paths := getPath(envv) - +func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -147,7 +149,7 @@ func ResolveExecutablePathVFS2(ctx context.Context, creds *auth.Credentials, mns if err != nil { return "", err } - dentry.DecRef() + dentry.DecRef(ctx) return binPath, nil } diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index f4d525523..1f8684dc6 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -62,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K // doesn't exist we will return the default home directory. return defaultHome, nil } - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Check read permissions on the file. if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { @@ -81,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K if err != nil { return "", err } - defer f.DecRef() + defer f.DecRef(ctx) r := &fileReader{ Ctx: ctx, @@ -105,7 +105,8 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. const defaultHome = "/" root := mns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) @@ -123,7 +124,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. if err != nil { return defaultHome, nil } - defer fd.DecRef() + defer fd.DecRef(ctx) r := &fileReaderVFS2{ ctx: ctx, diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go index 7d8e9ac7c..12b786224 100644 --- a/pkg/sentry/fs/user/user_test.go +++ b/pkg/sentry/fs/user/user_test.go @@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer etc.DecRef() + defer etc.DecRef(ctx) switch mode.FileType() { case 0: // Don't create anything. @@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer passwd.DecRef() + defer passwd.DecRef(ctx) if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil { return err } @@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) { if err != nil { t.Fatalf("NewMountNamespace failed: %v", err) } - defer mns.DecRef() + defer mns.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = fs.WithRoot(ctx, root) if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil { diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go index 8e7590721..7e61209ee 100644 --- a/pkg/sentry/fsbridge/bridge.go +++ b/pkg/sentry/fsbridge/bridge.go @@ -44,7 +44,7 @@ type File interface { IncRef() // DecRef decrements reference. - DecRef() + DecRef(ctx context.Context) } // Lookup provides a common interface to open files. diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 093ce1fb3..9785fd62a 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { // global there. return "" } - defer root.DecRef() + defer root.DecRef(ctx) name, _ := f.file.Dirent.FullName(root) return name @@ -87,8 +87,8 @@ func (f *fsFile) IncRef() { } // DecRef implements File. -func (f *fsFile) DecRef() { - f.file.DecRef() +func (f *fsFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // fsLookup implements Lookup interface using fs.File. @@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio if err != nil { return nil, err } - defer d.DecRef() + defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { return nil, syserror.ELOOP diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 89168220a..be0900030 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File { // PathnameWithDeleted implements File. func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) @@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() { } // DecRef implements File. -func (f *VFSFile) DecRef() { - f.file.DecRef() +func (f *VFSFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // FileDescription returns the FileDescription represented by f. It does not @@ -122,7 +122,7 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) // remainingTraversals is not configurable in VFS2, all callers are using the // default anyways. func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { - vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() + vfsObj := l.root.Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) path := fspath.Parse(pathname) pop := &vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 585764223..6af3c3781 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -1,7 +1,19 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "root_inode_refs", + out = "root_inode_refs.go", + package = "devpts", + prefix = "rootInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "rootInode", + }, +) + go_library( name = "devpts", srcs = [ @@ -9,15 +21,23 @@ go_library( "line_discipline.go", "master.go", "queue.go", - "slave.go", + "replica.go", + "root_inode_refs.go", "terminal.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", + "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index c03c65445..9185877f6 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -35,29 +35,56 @@ import ( const Name = "devpts" // FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} +// +// +stateify savable +type FilesystemType struct { + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported. + initErr error + + // fs backs all mounts of this FilesystemType. root is fs' root. fs and root + // are immutable. + fs *vfs.Filesystem + root *vfs.Dentry +} // Name implements vfs.FilesystemType.Name. -func (FilesystemType) Name() string { +func (*FilesystemType) Name() string { return Name } -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // No data allowed. if opts.Data != "" { return nil, nil, syserror.EINVAL } - fs, root, err := fstype.newFilesystem(vfsObj, creds) - if err != nil { - return nil, nil, err + fstype.initOnce.Do(func() { + fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds) + if err != nil { + fstype.initErr = err + return + } + fstype.fs = fs.VFSFilesystem() + fstype.root = root.VFSDentry() + }) + if fstype.initErr != nil { + return nil, nil, fstype.initErr } - return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil + fstype.fs.IncRef() + fstype.root.IncRef() + return fstype.fs, fstype.root, nil } +// Release implements vfs.FilesystemType.Release. +func (fstype *FilesystemType) Release(ctx context.Context) { + if fstype.fs != nil { + fstype.root.DecRef(ctx) + fstype.fs.DecRef(ctx) + } +} + +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -66,7 +93,7 @@ type filesystem struct { // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. -func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { +func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err @@ -79,57 +106,60 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds // Construct the root directory. This is always inode id 1. root := &rootInode{ - slaves: make(map[uint32]*slaveInode), + replicas: make(map[uint32]*replicaInode), } - root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) + root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - root.dentry.Init(root) + root.EnableLeakCheck() + + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) // Construct the pts master inode and dentry. Linux always uses inode // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. master := &masterInode{ root: root, } - master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) - master.dentry.Init(master) + master.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) // Add the master as a child of the root. - links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{ - "ptmx": &master.dentry, + links := root.OrderedChildren.Populate(map[string]kernfs.Inode{ + "ptmx": master, }) root.IncLinks(links) - return fs, &root.dentry, nil + return fs, &rootD, nil } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // rootInode is the root directory inode for the devpts mounts. +// +// +stateify savable type rootInode struct { - kernfs.AlwaysValid + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren + rootInodeRefs - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry + locks vfs.FileLocks // master is the master pty inode. Immutable. master *masterInode - // root is the root directory inode for this filesystem. Immutable. - root *rootInode - // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` - // slaves maps pty ids to slave inodes. - slaves map[uint32]*slaveInode + // replicas maps pty ids to replica inodes. + replicas map[uint32]*replicaInode // nextIdx is the next pty index to use. Must be accessed atomically. // @@ -140,7 +170,7 @@ type rootInode struct { var _ kernfs.Inode = (*rootInode)(nil) // allocateTerminal creates a new Terminal and installs a pts node for it. -func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) { +func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) { i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { @@ -149,41 +179,46 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) idx := i.nextIdx i.nextIdx++ - // Sanity check that slave with idx does not exist. - if _, ok := i.slaves[idx]; ok { + // Sanity check that replica with idx does not exist. + if _, ok := i.replicas[idx]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) } - // Create the new terminal and slave. + // Create the new terminal and replica. t := newTerminal(idx) - slave := &slaveInode{ + replica := &replicaInode{ root: i, t: t, } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). - slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) - slave.dentry.Init(slave) - i.slaves[idx] = slave + replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) + i.replicas[idx] = replica return t, nil } // masterClose is called when the master end of t is closed. -func (i *rootInode) masterClose(t *Terminal) { +func (i *rootInode) masterClose(ctx context.Context, t *Terminal) { i.mu.Lock() defer i.mu.Unlock() - // Sanity check that slave with idx exists. - if _, ok := i.slaves[t.n]; !ok { + // Sanity check that replica with idx exists. + ri, ok := i.replicas[t.n] + if !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } - delete(i.slaves, t.n) + + // Drop the ref on replica inode taken during rootInode.allocateTerminal. + ri.DecRef(ctx) + delete(i.replicas, t.n) } // Open implements kernfs.Inode.Open. -func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } @@ -191,27 +226,34 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D } // Lookup implements kernfs.Inode.Lookup. -func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + // Check if a static entry was looked up. + if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { + return d, nil + } + + // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } i.mu.Lock() defer i.mu.Unlock() - if si, ok := i.slaves[uint32(idx)]; ok { - si.dentry.IncRef() - return si.dentry.VFSDentry(), nil + if ri, ok := i.replicas[uint32(idx)]; ok { + ri.IncRef() // This ref is passed to the dentry upon creation via Init. + return ri, nil } return nil, syserror.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. -func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { +func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { i.mu.Lock() defer i.mu.Unlock() - ids := make([]int, 0, len(i.slaves)) - for id := range i.slaves { + i.InodeAttrs.TouchAtime(ctx, mnt) + ids := make([]int, 0, len(i.replicas)) + for id := range i.replicas { ids = append(ids, int(id)) } sort.Ints(ids) @@ -219,7 +261,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(id), 10), Type: linux.DT_CHR, - Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + Ino: i.replicas[uint32(id)].InodeAttrs.Ino(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -229,3 +271,16 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } return offset, nil } + +// DecRef implements kernfs.Inode.DecRef. +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go index b7c149047..448390cfe 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_test.go +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func TestSimpleMasterToSlave(t *testing.T) { - ld := newLineDiscipline(linux.DefaultSlaveTermios) +func TestSimpleMasterToReplica(t *testing.T) { + ld := newLineDiscipline(linux.DefaultReplicaTermios) ctx := contexttest.Context(t) inBytes := []byte("hello, tty\n") src := usermem.BytesIOSequence(inBytes) diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index f7bc325d1..ae95fdd08 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,7 +42,7 @@ const ( ) // lineDiscipline dictates how input and output are handled between the -// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// pseudoterminal (pty) master and replica. It can be configured to alter I/O, // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man // pages are good resources for how to affect the line discipline: // @@ -52,8 +53,8 @@ const ( // // lineDiscipline has a simple structure but supports a multitude of options // (see the above man pages). It consists of two queues of bytes: one from the -// terminal master to slave (the input queue) and one from slave to master (the -// output queue). When bytes are written to one end of the pty, the line +// terminal master to replica (the input queue) and one from replica to master +// (the output queue). When bytes are written to one end of the pty, the line // discipline reads the bytes, modifies them or takes special action if // required, and enqueues them to be read by the other end of the pty: // @@ -62,7 +63,7 @@ const ( // | (inputQueueWrite) +-------------+ (inputQueueRead) | // | | // | v -// masterFD slaveFD +// masterFD replicaFD // ^ | // | | // | output to terminal +--------------+ output from process | @@ -99,10 +100,10 @@ type lineDiscipline struct { column int // masterWaiter is used to wait on the master end of the TTY. - masterWaiter waiter.Queue `state:"zerovalue"` + masterWaiter waiter.Queue - // slaveWaiter is used to wait on the slave end of the TTY. - slaveWaiter waiter.Queue `state:"zerovalue"` + // replicaWaiter is used to wait on the replica end of the TTY. + replicaWaiter waiter.Queue } func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { @@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { } // getTermios gets the linux.Termios for the tty. -func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.RLock() defer l.termiosMu.RUnlock() // We must copy a Termios struct, not KernelTermios. t := l.termios.ToTermios() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyOut(task, args[2].Pointer()) return 0, err } // setTermios sets a linux.Termios for the tty. -func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) { l.termiosMu.Lock() defer l.termiosMu.Unlock() oldCanonEnabled := l.termios.LEnabled(linux.ICANON) // We must copy a Termios struct, not KernelTermios. var t linux.Termios - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := t.CopyIn(task, args[2].Pointer()) l.termios.FromTermios(t) // If canonical mode is turned off, move bytes from inQueue's wait @@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return 0, err } -func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyOut(t, args[2].Pointer()) return err } -func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() - _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := l.size.CopyIn(t, args[2].Pointer()) return err } @@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask { return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } -func (l *lineDiscipline) slaveReadiness() waiter.EventMask { +func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } -func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.inQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(t, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque if n > 0 { l.masterWaiter.Notify(waiter.EventOut) if pushed { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) } return n, nil } @@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventIn) + l.replicaWaiter.Notify(waiter.EventIn) return n, nil } return 0, syserror.ErrWouldBlock } -func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { - return l.outQueue.readableSize(ctx, io, args) +func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(t, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { @@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ return 0, err } if n > 0 { - l.slaveWaiter.Notify(waiter.EventOut) + l.replicaWaiter.Notify(waiter.EventOut) if pushed { l.masterWaiter.Notify(waiter.EventIn) } diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 7a7ce5d81..e91fa26a4 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -17,8 +17,11 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -28,14 +31,16 @@ import ( ) // masterInode is the inode for the master end of the Terminal. +// +// +stateify savable type masterInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry + locks vfs.FileLocks // root is the devpts root inode. root *rootInode @@ -44,27 +49,26 @@ type masterInode struct { var _ kernfs.Inode = (*masterInode)(nil) // Open implements kernfs.Inode.Open. -func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - t, err := mi.root.allocateTerminal(rp.Credentials()) +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + t, err := mi.root.allocateTerminal(ctx, rp.Credentials()) if err != nil { return nil, err } - mi.IncRef() fd := &masterFileDescription{ inode: mi, t: t, } - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef() + fd.LockFD.Init(&mi.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } // Stat implements kernfs.Inode.Stat. -func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := mi.InodeAttrs.Stat(vfsfs, opts) +func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } @@ -82,9 +86,11 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } +// +stateify savable type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD inode *masterInode t *Terminal @@ -93,9 +99,8 @@ type masterFileDescription struct { var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (mfd *masterFileDescription) Release() { - mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef() +func (mfd *masterFileDescription) Release(ctx context.Context) { + mfd.inode.root.masterClose(ctx, mfd.t) } // EventRegister implements waiter.Waitable.EventRegister. @@ -125,46 +130,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. - return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args) + return 0, mfd.t.ld.outputQueueReadSize(t, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration - // of the slave end. - return mfd.t.ld.getTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration - // of the slave end. - return mfd.t.ld.setTermios(ctx, io, args) + // of the replica end. + return mfd.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. - return mfd.t.ld.setTermios(ctx, io, args) + return mfd.t.ld.setTermios(t, args) case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) + nP := primitive.Uint32(mfd.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: - return 0, mfd.t.ld.windowSize(ctx, io, args) + return 0, mfd.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: - return 0, mfd.t.ld.setWindowSize(ctx, io, args) + return 0, mfd.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. - return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCNOTTY: // Release this process's controlling terminal. - return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. - return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. - return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, syserror.ENOTTY @@ -181,7 +191,17 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO // Stat implements vfs.FileDescriptionImpl.Stat. func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() - return mfd.inode.Stat(fs, opts) + return mfd.inode.Stat(ctx, fs, opts) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence) } // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index dffb4232c..55bff3e60 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,8 +17,10 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -30,7 +32,7 @@ import ( const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and -// slave. Bytes written to a queue are added to the read buffer until it is +// replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and @@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { } // readableSize writes the number of readable bytes to userspace. -func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { +func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() - var size int32 + size := primitive.Int32(0) if q.readable { - size = int32(len(q.readBuf)) + size = primitive.Int32(len(q.readBuf)) } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := size.CopyOut(t, args[2].Pointer()) return err } @@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca // as whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() @@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl // write writes to q from userspace. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { q.mu.Lock() defer q.mu.Unlock() @@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip // writeBytes writes to q from b. // -// Preconditions: -// * l.termiosMu must be held for reading. +// Preconditions: l.termiosMu must be held for reading. func (q *queue) writeBytes(b []byte, l *lineDiscipline) { q.mu.Lock() defer q.mu.Unlock() diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go new file mode 100644 index 000000000..70c68cf0a --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -0,0 +1,201 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// replicaInode is the inode for the replica end of the Terminal. +// +// +stateify savable +type replicaInode struct { + implStatFS + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + locks vfs.FileLocks + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. + t *Terminal +} + +var _ kernfs.Inode = (*replicaInode)(nil) + +// Open implements kernfs.Inode.Open. +func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &replicaFileDescription{ + inode: ri, + } + fd.LockFD.Init(&ri.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (ri *replicaInode) Valid(context.Context) bool { + // Return valid if the replica still exists. + ri.root.mu.Lock() + defer ri.root.mu.Unlock() + _, ok := ri.root.replicas[ri.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR + statx.RdevMinor = ri.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +// +stateify savable +type replicaFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *replicaInode +} + +var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil) + +// Release implements fs.FileOperations.Release. +func (rfd *replicaFileDescription) Release(ctx context.Context) {} + +// EventRegister implements waiter.Waitable.EventRegister. +func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) { + rfd.inode.t.ld.replicaWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return rfd.inode.t.ld.replicaReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return rfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return rfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // ioctl(2) may only be called from a task goroutine. + return 0, syserror.ENOTTY + } + + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args) + case linux.TCGETS: + return rfd.inode.t.ld.getTermios(t, args) + case linux.TCSETS: + return rfd.inode.t.ld.setTermios(t, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return rfd.inode.t.ld.setTermios(t, args) + case linux.TIOCGPTN: + nP := primitive.Uint32(rfd.inode.t.n) + _, err := nP.CopyOut(t, args[2].Pointer()) + return 0, err + case linux.TIOCGWINSZ: + return 0, rfd.inode.t.ld.windowSize(t, args) + case linux.TIOCSWINSZ: + return 0, rfd.inode.t.ld.setWindowSize(t, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() + return rfd.inode.Stat(ctx, fs, opts) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go deleted file mode 100644 index 526cd406c..000000000 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package devpts - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// slaveInode is the inode for the slave end of the Terminal. -type slaveInode struct { - kernfs.InodeAttrs - kernfs.InodeNoopRefCount - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink - - // Keep a reference to this inode's dentry. - dentry kernfs.Dentry - - // root is the devpts root inode. - root *rootInode - - // t is the connected Terminal. - t *Terminal -} - -var _ kernfs.Inode = (*slaveInode)(nil) - -// Open implements kernfs.Inode.Open. -func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - si.IncRef() - fd := &slaveFileDescription{ - inode: si, - } - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef() - return nil, err - } - return &fd.vfsfd, nil - -} - -// Valid implements kernfs.Inode.Valid. -func (si *slaveInode) Valid(context.Context) bool { - // Return valid if the slave still exists. - si.root.mu.Lock() - defer si.root.mu.Unlock() - _, ok := si.root.slaves[si.t.n] - return ok -} - -// Stat implements kernfs.Inode.Stat. -func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := si.InodeAttrs.Stat(vfsfs, opts) - if err != nil { - return linux.Statx{}, err - } - statx.Blksize = 1024 - statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR - statx.RdevMinor = si.t.n - return statx, nil -} - -// SetStat implements kernfs.Inode.SetStat -func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - if opts.Stat.Mask&linux.STATX_SIZE != 0 { - return syserror.EINVAL - } - return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) -} - -type slaveFileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - - inode *slaveInode -} - -var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) - -// Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release() { - sfd.inode.DecRef() -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { - sfd.inode.t.ld.slaveWaiter.EventUnregister(e) -} - -// Readiness implements waiter.Waitable.Readiness. -func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - return sfd.inode.t.ld.slaveReadiness() -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - return sfd.inode.t.ld.inputQueueRead(ctx, dst) -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - return sfd.inode.t.ld.outputQueueWrite(ctx, src) -} - -// Ioctl implements vfs.FileDescripionImpl.Ioctl. -func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { - switch cmd := args[1].Uint(); cmd { - case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ - // Get the number of bytes in the input queue read buffer. - return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) - case linux.TCGETS: - return sfd.inode.t.ld.getTermios(ctx, io, args) - case linux.TCSETS: - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TCSETSW: - // TODO(b/29356795): This should drain the output queue first. - return sfd.inode.t.ld.setTermios(ctx, io, args) - case linux.TIOCGPTN: - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ - AddressSpaceActive: true, - }) - return 0, err - case linux.TIOCGWINSZ: - return 0, sfd.inode.t.ld.windowSize(ctx, io, args) - case linux.TIOCSWINSZ: - return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) - case linux.TIOCSCTTY: - // Make the given terminal the controlling terminal of the - // calling process. - return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCNOTTY: - // Release this process's controlling terminal. - return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) - case linux.TIOCGPGRP: - // Get the foreground process group. - return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) - case linux.TIOCSPGRP: - // Set the foreground process group. - return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) - default: - maybeEmitUnimplementedEvent(ctx, cmd) - return 0, syserror.ENOTTY - } -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - creds := auth.CredentialsFromContext(ctx) - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.SetStat(ctx, fs, creds, opts) -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.Stat(fs, opts) -} diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go index 7d2781c54..510bd6d89 100644 --- a/pkg/sentry/fsimpl/devpts/terminal.go +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -17,9 +17,9 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/usermem" ) // Terminal is a pseudoterminal. @@ -36,25 +36,25 @@ type Terminal struct { // this terminal. This field is immutable. masterKTTY *kernel.TTY - // slaveKTTY contains the controlling process of the slave end of this + // replicaKTTY contains the controlling process of the replica end of this // terminal. This field is immutable. - slaveKTTY *kernel.TTY + replicaKTTY *kernel.TTY } func newTerminal(n uint32) *Terminal { - termios := linux.DefaultSlaveTermios + termios := linux.DefaultReplicaTermios t := Terminal{ - n: n, - ld: newLineDiscipline(termios), - masterKTTY: &kernel.TTY{Index: n}, - slaveKTTY: &kernel.TTY{Index: n}, + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + replicaKTTY: &kernel.TTY{Index: n}, } return &t } // setControllingTTY makes tm the controlling terminal of the calling thread // group. -func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("setControllingTTY must be called from a task context") @@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a // releaseControllingTTY removes tm as the controlling terminal of the calling // thread group. -func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { +func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error { task := kernel.TaskFromContext(ctx) if task == nil { panic("releaseControllingTTY must be called from a task context") @@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar } // foregroundProcessGroup gets the process group ID of tm's foreground process. -func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("foregroundProcessGroup must be called from a task context") @@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a } // Write it out to *arg. - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ - AddressSpaceActive: true, - }) + retP := primitive.Int32(ret) + _, err = retP.CopyOut(task, args[2].Pointer()) return 0, err } // foregroundProcessGroup sets tm's foreground process. -func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) { task := kernel.TaskFromContext(ctx) if task == nil { panic("setForegroundProcessGroup must be called from a task context") } // Read in the process group ID. - var pgid int32 - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgid primitive.Int32 + if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } @@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } - return tm.slaveKTTY + return tm.replicaKTTY } diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD index aa0c2ad8c..e49a04c1b 100644 --- a/pkg/sentry/fsimpl/devtmpfs/BUILD +++ b/pkg/sentry/fsimpl/devtmpfs/BUILD @@ -4,7 +4,10 @@ licenses(["notice"]) go_library( name = "devtmpfs", - srcs = ["devtmpfs.go"], + srcs = [ + "devtmpfs.go", + "save_restore.go", + ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -24,6 +27,7 @@ go_test( library = ":devtmpfs", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/tmpfs", diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 142ee53b0..e6fe0fc0d 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -18,6 +18,7 @@ package devtmpfs import ( "fmt" + "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -32,8 +33,10 @@ import ( const Name = "devtmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct { - initOnce sync.Once + initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported. initErr error // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. @@ -68,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua return fst.fs, fst.root, nil } +// Release implements vfs.FilesystemType.Release. +func (fst *FilesystemType) Release(ctx context.Context) { + if fst.fs != nil { + // Release the original reference obtained when creating the filesystem. + fst.root.DecRef(ctx) + fst.fs.DecRef(ctx) + } +} + // Accessor allows devices to create device special files in devtmpfs. type Accessor struct { vfsObj *vfs.VirtualFilesystem @@ -79,22 +91,25 @@ type Accessor struct { // NewAccessor returns an Accessor that supports creation of device special // files in the devtmpfs instance registered with name fsTypeName in vfsObj. func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{}) if err != nil { return nil, err } + // Pass a reference on root to the Accessor. + root := mntns.Root() + root.IncRef() return &Accessor{ vfsObj: vfsObj, mntns: mntns, - root: mntns.Root(), + root: root, creds: creds, }, nil } // Release must be called when a is no longer in use. -func (a *Accessor) Release() { - a.root.DecRef() - a.mntns.DecRef() +func (a *Accessor) Release(ctx context.Context) { + a.root.DecRef(ctx) + a.mntns.DecRef(ctx) } // accessorContext implements context.Context by extending an existing @@ -136,6 +151,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation { // CreateDeviceFile creates a device special file at the given pathname in the // devtmpfs instance accessed by the Accessor. func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { + actx := a.wrapContext(ctx) + mode := (linux.FileMode)(perms) switch kind { case vfs.BlockDevice: @@ -145,12 +162,22 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v default: panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind)) } + + // Create any parent directories. See + // devtmpfs.c:handle_create()=>path_create(). + parent := path.Dir(pathname) + if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create device parent directory %q: %v", parent, err) + } + // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't // create, which it recognizes by storing a pointer to the kdevtmpfs struct // thread in struct inode::i_private. Accessor doesn't yet support deletion // of files at all, and probably won't as long as we don't need to support // kernel modules, so this is moot for now. - return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ + return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ Mode: mode, DevMajor: major, DevMinor: minor, diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index b6d52c015..e058eda7a 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -15,9 +15,11 @@ package devtmpfs import ( + "path" "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" @@ -25,12 +27,15 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -func TestDevtmpfs(t *testing.T) { +const devPath = "/dev" + +func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) { + t.Helper() + ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Register tmpfs just so that we can have a root filesystem that isn't @@ -43,14 +48,12 @@ func TestDevtmpfs(t *testing.T) { }) // Create a test mount namespace with devtmpfs mounted at "/dev". - const devPath = "/dev" - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() root := mntns.Root() - defer root.DecRef() + root.IncRef() devpop := vfs.PathOperation{ Root: root, Start: root, @@ -61,62 +64,167 @@ func TestDevtmpfs(t *testing.T) { }); err != nil { t.Fatalf("failed to create mount point: %v", err) } - if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { t.Fatalf("failed to mount devtmpfs: %v", err) } + return ctx, creds, vfsObj, root, func() { + root.DecRef(ctx) + mntns.DecRef(ctx) + } +} + +func TestUserspaceInit(t *testing.T) { + ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t) + defer cleanup() + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") if err != nil { t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - defer a.Release() + defer a.Release(ctx) // Create "userspace-initialized" files using a devtmpfs.Accessor. if err := a.UserspaceInit(ctx); err != nil { t.Fatalf("failed to userspace-initialize devtmpfs: %v", err) } + // Created files should be visible in the test mount namespace. - abspath := devPath + "/fd" - target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(abspath), - }) - if want := "/proc/self/fd"; err != nil || target != want { - t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want) + links := []struct { + source string + target string + }{ + { + source: "fd", + target: "/proc/self/fd", + }, + { + source: "stdin", + target: "/proc/self/fd/0", + }, + { + source: "stdout", + target: "/proc/self/fd/1", + }, + { + source: "stderr", + target: "/proc/self/fd/2", + }, + { + source: "ptmx", + target: "pts/ptmx", + }, } - // Create a dummy device special file using a devtmpfs.Accessor. - const ( - pathInDev = "dummy" - kind = vfs.CharDevice - major = 12 - minor = 34 - perms = 0600 - wantMode = linux.S_IFCHR | perms - ) - if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil { - t.Fatalf("failed to create device file: %v", err) + for _, link := range links { + abspath := path.Join(devPath, link.source) + if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }); err != nil || gotTarget != link.target { + t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target) + } } - // The device special file should be visible in the test mount namespace. - abspath = devPath + "/" + pathInDev - stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(abspath), - }, &vfs.StatOptions{ - Mask: linux.STATX_TYPE | linux.STATX_MODE, - }) - if err != nil { - t.Fatalf("failed to stat device file at %q: %v", abspath, err) + + dirs := []string{"shm", "pts"} + for _, dir := range dirs { + abspath := path.Join(devPath, dir) + statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_MODE, + }) + if err != nil { + t.Errorf("stat(%q): got error %v ", abspath, err) + continue + } + if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want { + t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want) + } } - if stat.Mode != wantMode { - t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) +} + +func TestCreateDeviceFile(t *testing.T) { + ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t) + defer cleanup() + + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") + if err != nil { + t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - if stat.RdevMajor != major { - t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major) + defer a.Release(ctx) + + devFiles := []struct { + path string + kind vfs.DeviceKind + major uint32 + minor uint32 + perms uint16 + }{ + { + path: "dummy", + kind: vfs.CharDevice, + major: 12, + minor: 34, + perms: 0600, + }, + { + path: "foo/bar", + kind: vfs.BlockDevice, + major: 13, + minor: 35, + perms: 0660, + }, + { + path: "foo/baz", + kind: vfs.CharDevice, + major: 12, + minor: 40, + perms: 0666, + }, + { + path: "a/b/c/d/e", + kind: vfs.BlockDevice, + major: 12, + minor: 34, + perms: 0600, + }, } - if stat.RdevMinor != minor { - t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor) + + for _, f := range devFiles { + if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil { + t.Fatalf("failed to create device file: %v", err) + } + // The device special file should be visible in the test mount namespace. + abspath := path.Join(devPath, f.path) + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE | linux.STATX_MODE, + }) + if err != nil { + t.Fatalf("failed to stat device file at %q: %v", abspath, err) + } + if stat.RdevMajor != f.major { + t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major) + } + if stat.RdevMinor != f.minor { + t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor) + } + wantMode := f.perms + switch f.kind { + case vfs.CharDevice: + wantMode |= linux.S_IFCHR + case vfs.BlockDevice: + wantMode |= linux.S_IFBLK + } + if stat.Mode != wantMode { + t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) + } } } diff --git a/pkg/sentry/fsimpl/devtmpfs/save_restore.go b/pkg/sentry/fsimpl/devtmpfs/save_restore.go new file mode 100644 index 000000000..28832d850 --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/save_restore.go @@ -0,0 +1,23 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devtmpfs + +// afterLoad is invoked by stateify. +func (fst *FilesystemType) afterLoad() { + if fst.fs != nil { + // Ensure that we don't create another filesystem. + fst.initOnce.Do(func() {}) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index c573d7935..5b29f2358 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -30,17 +30,20 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// EventFileDescription implements FileDescriptionImpl for file-based event +// EventFileDescription implements vfs.FileDescriptionImpl for file-based event // notification (eventfd). Eventfds are usually internal to the Sentry but in // certain situations they may be converted into a host-backed eventfd. +// +// +stateify savable type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // queue is used to notify interested parties when the event object // becomes readable or writable. - queue waiter.Queue `state:"zerovalue"` + queue waiter.Queue // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -58,9 +61,9 @@ type EventFileDescription struct { var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) // New creates a new event fd. -func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) efd := &EventFileDescription{ val: initVal, semMode: semMode, @@ -105,8 +108,8 @@ func (efd *EventFileDescription) HostFD() (int, error) { return efd.hostfd, nil } -// Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { +// Release implements vfs.FileDescriptionImpl.Release. +func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { @@ -118,7 +121,7 @@ func (efd *EventFileDescription) Release() { } } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { if dst.NumBytes() < 8 { return 0, syscall.EINVAL @@ -129,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc return 8, nil } -// Write implements FileDescriptionImpl.Write. +// Write implements vfs.FileDescriptionImpl.Write. func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { if src.NumBytes() < 8 { return 0, syscall.EINVAL diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go index 20e3adffc..49916fa81 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) { for _, initVal := range initVals { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) // Register a callback for a write event. w, ch := waiter.NewChannelEntry(nil) @@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) { func TestEventFDStat(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) statx, err := eventfd.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index ff861d0fe..7b1eec3da 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -51,9 +51,12 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", @@ -85,9 +88,9 @@ go_test( library = ":ext", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fspath", + "//pkg/marshal/primitive", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", @@ -95,7 +98,7 @@ go_test( "//pkg/syserror", "//pkg/test/testutil", "//pkg/usermem", - "@com_github_google_go-cmp//cmp:go_default_library", - "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 89caee3df..2ee7cc7ac 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -53,22 +53,27 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, nil, nil, nil, err } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err } root := mntns.Root() + root.IncRef() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { b.Fatalf("tearDown failed: %v", err) @@ -90,7 +95,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ InternalData: int(f.Fd()), }, @@ -169,7 +174,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create extfs submount. mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index a2d8c3ad6..1165234f9 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -18,7 +18,7 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/syserror" ) @@ -34,19 +34,19 @@ type blockMapFile struct { // directBlks are the direct blocks numbers. The physical blocks pointed by // these holds file data. Contains file blocks 0 to 11. - directBlks [numDirectBlks]uint32 + directBlks [numDirectBlks]primitive.Uint32 // indirectBlk is the physical block which contains (blkSize/4) direct block // numbers (as uint32 integers). - indirectBlk uint32 + indirectBlk primitive.Uint32 // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect // block numbers (as uint32 integers). - doubleIndirectBlk uint32 + doubleIndirectBlk primitive.Uint32 // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly // indirect block numbers (as uint32 integers). - tripleIndirectBlk uint32 + tripleIndirectBlk primitive.Uint32 // coverage at (i)th index indicates the amount of file data a node at // height (i) covers. Height 0 is the direct block. @@ -58,19 +58,22 @@ var _ io.ReaderAt = (*blockMapFile)(nil) // newBlockMapFile is the blockMapFile constructor. It initializes the file to // physical blocks map with (at most) the first 12 (direct) blocks. -func newBlockMapFile(regFile regularFile) (*blockMapFile, error) { - file := &blockMapFile{regFile: regFile} +func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { + file := &blockMapFile{} file.regFile.impl = file + file.regFile.inode.init(args, &file.regFile) for i := uint(0); i < 4; i++ { - file.coverage[i] = getCoverage(regFile.inode.blkSize, i) + file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i) } - blkMap := regFile.inode.diskInode.Data() - binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) - binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) - binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk) + blkMap := file.regFile.inode.diskInode.Data() + for i := 0; i < numDirectBlks; i++ { + file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4]) + } + file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4]) + file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4]) + file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4]) return file, nil } @@ -116,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) { switch { case offset < dirBlksEnd: // Direct block. - curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:]) + curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:]) case offset < indirBlkEnd: // Indirect block. - curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:]) + curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:]) case offset < doubIndirBlkEnd: // Doubly indirect block. - curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:]) + curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:]) default: // Triply indirect block. - curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:]) + curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:]) } read += curR @@ -173,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds read := 0 curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { - var childPhyBlk uint32 + var childPhyBlk primitive.Uint32 err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } - n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:]) + n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:]) read += n if err != nil { return read, err diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 181727ef7..ed98b482e 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -20,7 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -85,49 +85,50 @@ func (n *blkNumGen) next() uint32 { // the inode covers and that is written to disk. func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) - regFile := regularFile{ - inode: inode{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: getMockBMFileFize(), - }, - }, - blkSize: uint64(mockBMBlkSize), - }, - } - var fileData []byte blkNums := newBlkNumGen() - var data []byte + off := 0 + data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes()) // Write the direct blocks. for i := 0; i < numDirectBlks; i++ { - curBlkNum := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, curBlkNum) - fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(data[off:]) + off += curBlkNum.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...) } // Write to indirect block. - indirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, indirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...) - - // Write to indirect block. - doublyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...) - - // Write to indirect block. - triplyIndirectBlk := blkNums.next() - data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) - fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) - - copy(regFile.inode.diskInode.Data(), data) + indirectBlk := primitive.Uint32(blkNums.next()) + indirectBlk.MarshalBytes(data[off:]) + off += indirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...) + + // Write to double indirect block. + doublyIndirectBlk := primitive.Uint32(blkNums.next()) + doublyIndirectBlk.MarshalBytes(data[off:]) + off += doublyIndirectBlk.SizeBytes() + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...) + + // Write to triple indirect block. + triplyIndirectBlk := primitive.Uint32(blkNums.next()) + triplyIndirectBlk.MarshalBytes(data[off:]) + fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...) + + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: getMockBMFileFize(), + }, + }, + blkSize: uint64(mockBMBlkSize), + } + copy(args.diskInode.Data(), data) - mockFile, err := newBlockMapFile(regFile) + mockFile, err := newBlockMapFile(args) if err != nil { t.Fatalf("newBlockMapFile failed: %v", err) } @@ -145,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN var fileData []byte for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 { - curBlkNum := blkNums.next() - copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum)) - fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...) + curBlkNum := primitive.Uint32(blkNums.next()) + curBlkNum.MarshalBytes(disk[off : off+4]) + fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...) } return fileData } diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 6bd1a9fc6..9bfed883a 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -15,10 +15,13 @@ package ext import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -55,7 +58,7 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { // FIXME(b/134676337): filesystem.mu may not be locked as required by // inode.decRef(). d.inode.decRef() @@ -63,12 +66,17 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +// TODO(b/134676337): Implement inotify. +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // -// TODO(gvisor.dev/issue/1479): Implement inotify. +// TODO(b/134676337): Implement inotify. func (d *dentry) Watches() *vfs.Watches { return nil } + +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +// +// TODO(b/134676337): Implement inotify. +func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 12b875c8f..0ad79b381 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -16,10 +16,10 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -27,6 +27,8 @@ import ( ) // directory represents a directory inode. It holds the childList in memory. +// +// +stateify savable type directory struct { inode inode @@ -38,7 +40,7 @@ type directory struct { // Lock Order (outermost locks must be taken first): // directory.mu // filesystem.mu - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // childList is a list containing (1) child dirents and (2) fake dirents // (with diskDirent == nil) that represent the iteration position of @@ -54,16 +56,15 @@ type directory struct { } // newDirectory is the directory constructor. -func newDirectory(inode inode, newDirent bool) (*directory, error) { +func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { file := &directory{ - inode: inode, childCache: make(map[string]*dentry), childMap: make(map[string]*dirent), } - file.inode.impl = file + file.inode.init(args, file) // Initialize childList by reading dirents from the underlying file. - if inode.diskInode.Flags().Index { + if args.diskInode.Flags().Index { // TODO(b/134676337): Support hash tree directories. Currently only the '.' // and '..' entries are read in. @@ -74,7 +75,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) { // The dirents are organized in a linear array in the file data. // Extract the file data and decode the dirents. - regFile, err := newRegularFile(inode) + regFile, err := newRegularFile(args) if err != nil { return nil, err } @@ -82,7 +83,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) { // buf is used as scratch space for reading in dirents from disk and // unmarshalling them into dirent structs. buf := make([]byte, disklayout.DirentSize) - size := inode.diskInode.Size() + size := args.diskInode.Size() for off, inc := uint64(0), uint64(0); off < size; off += inc { toRead := size - off if toRead > disklayout.DirentSize { @@ -98,7 +99,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) { } else { curDirent.diskDirent = &disklayout.DirentOld{} } - binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent) + curDirent.diskDirent.UnmarshalBytes(buf) if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 { // Inode number and name length fields being set to 0 is used to indicate @@ -120,6 +121,8 @@ func (i *inode) isDir() bool { } // dirent is the directory.childList node. +// +// +stateify savable type dirent struct { diskDirent disklayout.Dirent @@ -129,6 +132,8 @@ type dirent struct { // directoryFD represents a directory file description. It implements // vfs.FileDescriptionImpl. +// +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl @@ -142,7 +147,7 @@ type directoryFD struct { var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter == nil { return } @@ -306,3 +311,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index 9bd9c76c0..d98a05dd8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -22,10 +22,11 @@ go_library( "superblock_old.go", "test_utils.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", + "//pkg/marshal", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go index ad6f4fef8..0d56ae9da 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // BlockGroup represents a Linux ext block group descriptor. An ext file system // is split into a series of block groups. This provides an access layer to // information needed to access and use a block group. @@ -30,6 +34,8 @@ package disklayout // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors. type BlockGroup interface { + marshal.Marshallable + // InodeTable returns the absolute block number of the block containing the // inode table. This points to an array of Inode structs. Inode tables are // statically allocated at mkfs time. The superblock records the number of diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go index 3e16c76db..a35fa22a0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go @@ -17,6 +17,8 @@ package disklayout // BlockGroup32Bit emulates the first half of struct ext4_group_desc in // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and // 32-bit ext4 filesystems. It implements BlockGroup interface. +// +// +marshal type BlockGroup32Bit struct { BlockBitmapLo uint32 InodeBitmapLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go index 9a809197a..d54d1d345 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go @@ -18,6 +18,8 @@ package disklayout // It is the block group descriptor struct for 64-bit ext4 filesystems. // It implements BlockGroup interface. It is an extension of the 32-bit // version of BlockGroup. +// +// +marshal type BlockGroup64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go index 0ef4294c0..e4ce484e4 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go @@ -21,6 +21,8 @@ import ( // TestBlockGroupSize tests that the block group descriptor structs are of the // correct size. func TestBlockGroupSize(t *testing.T) { - assertSize(t, BlockGroup32Bit{}, 32) - assertSize(t, BlockGroup64Bit{}, 64) + var bgSmall BlockGroup32Bit + assertSize(t, &bgSmall, 32) + var bgBig BlockGroup64Bit + assertSize(t, &bgBig, 64) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go index 417b6cf65..568c8cb4c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go @@ -15,6 +15,7 @@ package disklayout import ( + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fs" ) @@ -51,6 +52,8 @@ var ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories. type Dirent interface { + marshal.Marshallable + // Inode returns the absolute inode number of the underlying inode. // Inode number 0 signifies an unused dirent. Inode() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go index 29ae4a5c2..51f9c2946 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go @@ -29,12 +29,14 @@ import ( // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentNew struct { InodeNumber uint32 RecordLength uint16 NameLength uint8 FileTypeRaw uint8 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentNew implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go index 6fff12a6e..d4b19e086 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go @@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs" // Note: This struct can be of variable size on disk. The one described below // is of maximum size and the FileName beyond NameLength bytes might contain // garbage. +// +// +marshal type DirentOld struct { InodeNumber uint32 RecordLength uint16 NameLength uint16 - FileNameRaw [MaxFileName]byte + FileNameRaw [MaxFileName]byte `marshal:"unaligned"` } // Compiles only if DirentOld implements Dirent. diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go index 934919f8a..3486864dc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go @@ -21,6 +21,8 @@ import ( // TestDirentSize tests that the dirent structs are of the correct // size. func TestDirentSize(t *testing.T) { - assertSize(t, DirentOld{}, uintptr(DirentSize)) - assertSize(t, DirentNew{}, uintptr(DirentSize)) + var dOld DirentOld + assertSize(t, &dOld, DirentSize) + var dNew DirentNew + assertSize(t, &dNew, DirentSize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go index bdf4e2132..0834e9ba8 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go +++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go @@ -36,8 +36,6 @@ // escape analysis on an unknown implementation at compile time. // // Notes: -// - All fields in these structs are exported because binary.Read would -// panic otherwise. // - All structures on disk are in little-endian order. Only jbd2 (journal) // structures are in big-endian order. // - All OS dependent fields in these structures will be interpretted using diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go index 4110649ab..b13999bfc 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + // Extents were introduced in ext4 and provide huge performance gains in terms // data locality and reduced metadata block usage. Extents are organized in // extent trees. The root node is contained in inode.BlocksRaw. @@ -64,6 +68,8 @@ type ExtentNode struct { // ExtentEntry represents an extent tree node entry. The entry can either be // an ExtentIdx or Extent itself. This exists to simplify navigation logic. type ExtentEntry interface { + marshal.Marshallable + // FileBlock returns the first file block number covered by this entry. FileBlock() uint32 @@ -75,6 +81,8 @@ type ExtentEntry interface { // tree node begins with this and is followed by `NumEntries` number of: // - Extent if `Depth` == 0 // - ExtentIdx otherwise +// +// +marshal type ExtentHeader struct { // Magic in the extent magic number, must be 0xf30a. Magic uint16 @@ -96,6 +104,8 @@ type ExtentHeader struct { // internal nodes. Sorted in ascending order based on FirstFileBlock since // Linux does a binary search on this. This points to a block containing the // child node. +// +// +marshal type ExtentIdx struct { FirstFileBlock uint32 ChildBlockLo uint32 @@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 { // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a // binary search on this. This points to an array of data blocks containing the // file data. It covers `Length` data blocks starting from `StartBlock`. +// +// +marshal type Extent struct { FirstFileBlock uint32 Length uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go index 8762b90db..c96002e19 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -21,7 +21,10 @@ import ( // TestExtentSize tests that the extent structs are of the correct // size. func TestExtentSize(t *testing.T) { - assertSize(t, ExtentHeader{}, ExtentHeaderSize) - assertSize(t, ExtentIdx{}, ExtentEntrySize) - assertSize(t, Extent{}, ExtentEntrySize) + var h ExtentHeader + assertSize(t, &h, ExtentHeaderSize) + var i ExtentIdx + assertSize(t, &i, ExtentEntrySize) + var e Extent + assertSize(t, &e, ExtentEntrySize) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go index 88ae913f5..ef25040a9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go @@ -16,6 +16,7 @@ package disklayout import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) @@ -38,6 +39,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes. type Inode interface { + marshal.Marshallable + // Mode returns the linux file mode which is majorly used to extract // information like: // - File permissions (read/write/execute by user/group/others). diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go index 8f9f574ce..a4503f5cf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go @@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time" // are used to provide nanoscond precision. Hence, these timestamps will now // overflow in May 2446. // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps. +// +// +marshal type InodeNew struct { InodeOld diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go index db25b11b6..e6b28babf 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go @@ -30,6 +30,8 @@ const ( // // All fields representing time are in seconds since the epoch. Which means that // they will overflow in January 2038. +// +// +marshal type InodeOld struct { ModeRaw uint16 UIDLo uint16 diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go index dd03ee50e..90744e956 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go @@ -24,10 +24,12 @@ import ( // TestInodeSize tests that the inode structs are of the correct size. func TestInodeSize(t *testing.T) { - assertSize(t, InodeOld{}, OldInodeSize) + var iOld InodeOld + assertSize(t, &iOld, OldInodeSize) // This was updated from 156 bytes to 160 bytes in Oct 2015. - assertSize(t, InodeNew{}, 160) + var iNew InodeNew + assertSize(t, &iNew, 160) } // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go index 8bb327006..70948ebe9 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go @@ -14,6 +14,10 @@ package disklayout +import ( + "gvisor.dev/gvisor/pkg/marshal" +) + const ( // SbOffset is the absolute offset at which the superblock is placed. SbOffset = 1024 @@ -38,6 +42,8 @@ const ( // // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block. type SuperBlock interface { + marshal.Marshallable + // InodesCount returns the total number of inodes in this filesystem. InodesCount() uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go index 53e515fd3..4dc6080fb 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go @@ -17,6 +17,8 @@ package disklayout // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if // RevLevel = DynamicRev and 64-bit feature is disabled. +// +// +marshal type SuperBlock32Bit struct { // We embed the old superblock struct here because the 32-bit version is just // an extension of the old version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go index 7c1053fb4..2c9039327 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go @@ -19,6 +19,8 @@ package disklayout // 1024 bytes (smallest possible block size) and hence the superblock always // fits in no more than one data block. Should only be used when the 64-bit // feature is set. +// +// +marshal type SuperBlock64Bit struct { // We embed the 32-bit struct here because 64-bit version is just an extension // of the 32-bit version. diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go index 9221e0251..e4709f23c 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go @@ -16,6 +16,8 @@ package disklayout // SuperBlockOld implements SuperBlock and represents the old version of the // superblock struct. Should be used only if RevLevel = OldRev. +// +// +marshal type SuperBlockOld struct { InodesCountRaw uint32 BlocksCountLo uint32 diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go index 463b5ba21..b734b6987 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go @@ -21,7 +21,10 @@ import ( // TestSuperBlockSize tests that the superblock structs are of the correct // size. func TestSuperBlockSize(t *testing.T) { - assertSize(t, SuperBlockOld{}, 84) - assertSize(t, SuperBlock32Bit{}, 336) - assertSize(t, SuperBlock64Bit{}, 1024) + var sbOld SuperBlockOld + assertSize(t, &sbOld, 84) + var sb32 SuperBlock32Bit + assertSize(t, &sb32, 336) + var sb64 SuperBlock64Bit + assertSize(t, &sb64, 1024) } diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go index 9c63f04c0..a4bc08411 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go +++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go @@ -18,13 +18,13 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" ) -func assertSize(t *testing.T, v interface{}, want uintptr) { +func assertSize(t *testing.T, v marshal.Marshallable, want int) { t.Helper() - if got := binary.Size(v); got != want { + if got := v.SizeBytes(); got != want { t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got) } } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index dac6effbf..38fb7962b 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -34,11 +34,10 @@ import ( const Name = "ext" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} -// Compiles only if FilesystemType implements vfs.FilesystemType. -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // getDeviceFd returns an io.ReaderAt to the underlying device. // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. @@ -99,6 +98,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, @@ -123,32 +125,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 64e9a579f..d9fd4590c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,22 +65,27 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }) if err != nil { f.Close() return nil, nil, nil, nil, err } root := mntns.Root() + root.IncRef() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 11dcc0346..778460107 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -18,12 +18,13 @@ import ( "io" "sort" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // extentFile is a type of regular file which uses extents to store file data. +// +// +stateify savable type extentFile struct { regFile regularFile @@ -38,9 +39,10 @@ var _ io.ReaderAt = (*extentFile)(nil) // newExtentFile is the extent file constructor. It reads the entire extent // tree into memory. // TODO(b/134676337): Build extent tree on demand to reduce memory usage. -func newExtentFile(regFile regularFile) (*extentFile, error) { - file := &extentFile{regFile: regFile} +func newExtentFile(args inodeArgs) (*extentFile, error) { + file := &extentFile{} file.regFile.impl = file + file.regFile.inode.init(args, &file.regFile) err := file.buildExtTree() if err != nil { return nil, err @@ -57,7 +59,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) { func (f *extentFile) buildExtTree() error { rootNodeData := f.regFile.inode.diskInode.Data() - binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) + f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize]) // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. if f.root.Header.NumEntries > 4 { @@ -76,7 +78,7 @@ func (f *extentFile) buildExtTree() error { // Internal node. curEntry = &disklayout.ExtentIdx{} } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) + curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize]) f.root.Entries[i].Entry = curEntry } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index a2382daa3..985f76ac0 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -21,7 +21,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" ) @@ -177,21 +176,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] t.Helper() mockDisk := make([]byte, mockExtentBlkSize*10) - mockExtentFile := &extentFile{ - regFile: regularFile{ - inode: inode{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), - }, - }, - blkSize: mockExtentBlkSize, + mockExtentFile := &extentFile{} + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), }, }, + blkSize: mockExtentBlkSize, } + mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile) fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) @@ -204,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] // writeTree writes the tree represented by `root` to the inode and disk. It // also writes random file data on disk. func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte { - rootData := binary.Marshal(nil, binary.LittleEndian, root.Header) + rootData := in.diskInode.Data() + root.Header.MarshalBytes(rootData) + off := root.Header.SizeBytes() for _, ep := range root.Entries { - rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(rootData[off:]) + off += ep.Entry.SizeBytes() } - copy(in.diskInode.Data(), rootData) - var fileData []byte for _, ep := range root.Entries { if root.Header.Height == 0 { @@ -225,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl // writeTreeToDisk is the recursive step for writeTree which writes the tree // on the disk only. Also writes random file data on disk. func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte { - nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header) + nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:] + curNode.Node.Header.MarshalBytes(nodeData) + off := curNode.Node.Header.SizeBytes() for _, ep := range curNode.Node.Entries { - nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry) + ep.Entry.MarshalBytes(nodeData[off:]) + off += ep.Entry.SizeBytes() } - copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData) - var fileData []byte for _, ep := range curNode.Node.Entries { if curNode.Node.Header.Height == 0 { diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 92f7da40d..90b086468 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,6 +26,7 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 557963e03..917f1873d 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -38,11 +38,13 @@ var ( ) // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // dev represents the underlying fs device. It does not require protection // because io.ReaderAt permits concurrent read calls to it. It translates to @@ -81,10 +83,10 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). -// - !rp.Done(). -// - inode == vfsd.Impl().(*Dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { +// * filesystem.mu must be locked (for writing if write param is true). +// * !rp.Done(). +// * inode == vfsd.Impl().(*Dentry).inode. +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } @@ -100,7 +102,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } d := vfsd.Impl().(*dentry) if name == ".." { - isRoot, err := rp.CheckRoot(vfsd) + isRoot, err := rp.CheckRoot(ctx, vfsd) if err != nil { return nil, nil, err } @@ -108,7 +110,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo rp.Advance() return vfsd, inode, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, nil, err } rp.Advance() @@ -143,7 +145,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo child.name = name dir.childCache[name] = child } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, nil, err } if child.inode.isSymlink() && rp.ShouldFollowSymlink() { @@ -166,13 +168,13 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). -func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +// * filesystem.mu must be locked (for writing if write param is true). +func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -194,14 +196,14 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). // // Preconditions: -// - filesystem.mu must be locked (for writing if write param is true). -// - !rp.Done(). -func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +// * filesystem.mu must be locked (for writing if write param is true). +// * !rp.Done(). +func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -216,7 +218,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e // the rp till the parent of the last component which should be an existing // directory. If parent is false then resolves rp entirely. Attemps to resolve // the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { +func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { var ( vfsd *vfs.Dentry inode *inode @@ -227,9 +229,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // of disk. This reduces congestion (allows concurrent walks). fs.mu.RLock() if parent { - vfsd, inode, err = walkParentLocked(rp, false) + vfsd, inode, err = walkParentLocked(ctx, rp, false) } else { - vfsd, inode, err = walkLocked(rp, false) + vfsd, inode, err = walkLocked(ctx, rp, false) } fs.mu.RUnlock() @@ -238,9 +240,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // walk is fine as this is a read only filesystem. fs.mu.Lock() if parent { - vfsd, inode, err = walkParentLocked(rp, true) + vfsd, inode, err = walkParentLocked(ctx, rp, true) } else { - vfsd, inode, err = walkLocked(rp, true) + vfsd, inode, err = walkLocked(ctx, rp, true) } fs.mu.Unlock() } @@ -283,7 +285,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -292,7 +294,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -312,7 +314,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, true) + vfsd, inode, err := fs.walk(ctx, rp, true) if err != nil { return nil, err } @@ -322,7 +324,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -336,7 +338,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -349,7 +351,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return linux.Statx{}, err } @@ -360,7 +362,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(rp, false); err != nil { + if _, _, err := fs.walk(ctx, rp, false); err != nil { return linux.Statfs{}, err } @@ -370,7 +372,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -390,7 +392,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -403,7 +405,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -416,7 +418,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -430,7 +432,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -440,7 +442,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -454,7 +456,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -468,7 +470,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -478,7 +480,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -490,9 +492,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -504,36 +506,36 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(rp, false) +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { - _, _, err := fs.walk(rp, false) +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { + _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err } return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - _, _, err := fs.walk(rp, false) +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(rp, false) +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 485f86f4b..9009ba3c7 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -37,6 +37,8 @@ import ( // |-- regular-- // |-- extent file // |-- block map file +// +// +stateify savable type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 @@ -54,6 +56,8 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode + locks vfs.FileLocks + // This is immutable. The first field of the implementations must have inode // as the first field to ensure temporality. impl interface{} @@ -115,7 +119,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } // Build the inode based on its type. - inode := inode{ + args := inodeArgs{ fs: fs, inodeNum: inodeNum, blkSize: blkSize, @@ -124,19 +128,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { switch diskInode.Mode().FileType() { case linux.ModeSymlink: - f, err := newSymlink(inode) + f, err := newSymlink(args) if err != nil { return nil, err } return &f.inode, nil case linux.ModeRegular: - f, err := newRegularFile(inode) + f, err := newRegularFile(args) if err != nil { return nil, err } return &f.inode, nil case linux.ModeDirectory: - f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType) + f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType) if err != nil { return nil, err } @@ -147,6 +151,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } } +type inodeArgs struct { + fs *filesystem + inodeNum uint32 + blkSize uint64 + diskInode disklayout.Inode +} + +func (in *inode) init(args inodeArgs, impl interface{}) { + in.fs = args.fs + in.inodeNum = args.inodeNum + in.blkSize = args.blkSize + in.diskInode = args.diskInode + in.impl = impl +} + // open creates and returns a file description for the dentry passed in. func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) @@ -157,6 +176,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt switch in.impl.(type) { case *regularFile: var fd regularFileFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -168,6 +188,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.EISDIR } var fd directoryFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -178,6 +199,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.ELOOP } var fd symlinkFD + fd.LockFD.Init(&in.locks) fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 30135ddb0..4a5539b37 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -30,6 +31,8 @@ import ( // regularFile represents a regular file's inode. This too follows the // inheritance pattern prevelant in the vfs layer described in // pkg/sentry/vfs/README.md. +// +// +stateify savable type regularFile struct { inode inode @@ -43,28 +46,19 @@ type regularFile struct { // newRegularFile is the regularFile constructor. It figures out what kind of // file this is and initializes the fileReader. -func newRegularFile(inode inode) (*regularFile, error) { - regFile := regularFile{ - inode: inode, - } - - inodeFlags := inode.diskInode.Flags() - - if inodeFlags.Extents { - file, err := newExtentFile(regFile) +func newRegularFile(args inodeArgs) (*regularFile, error) { + if args.diskInode.Flags().Extents { + file, err := newExtentFile(args) if err != nil { return nil, err } - - file.regFile.inode.impl = &file.regFile return &file.regFile, nil } - file, err := newBlockMapFile(regFile) + file, err := newBlockMapFile(args) if err != nil { return nil, err } - file.regFile.inode.impl = &file.regFile return &file.regFile, nil } @@ -75,18 +69,21 @@ func (in *inode) isRegular() bool { // directoryFD represents a directory file description. It implements // vfs.FileDescriptionImpl. +// +// +stateify savable type regularFileFD struct { fileDescription + vfs.LockFD // off is the file offset. off is accessed using atomic memory operations. off int64 // offMu serializes operations that may mutate off. - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() {} +func (fd *regularFileFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { @@ -157,3 +154,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 1447a4dc1..5e2bcc837 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -23,6 +23,8 @@ import ( ) // symlink represents a symlink inode. +// +// +stateify savable type symlink struct { inode inode target string // immutable @@ -30,18 +32,17 @@ type symlink struct { // newSymlink is the symlink constructor. It reads out the symlink target from // the inode (however it might have been stored). -func newSymlink(inode inode) (*symlink, error) { - var file *symlink +func newSymlink(args inodeArgs) (*symlink, error) { var link []byte // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). // Otherwise either extents or block maps will be used to store the link. - size := inode.diskInode.Size() + size := args.diskInode.Size() if size < 60 { - link = inode.diskInode.Data()[:size] + link = args.diskInode.Data()[:size] } else { // Create a regular file out of this inode and read out the target. - regFile, err := newRegularFile(inode) + regFile, err := newRegularFile(args) if err != nil { return nil, err } @@ -52,8 +53,8 @@ func newSymlink(inode inode) (*symlink, error) { } } - file = &symlink{inode: inode, target: string(link)} - file.inode.impl = file + file := &symlink{target: string(link)} + file.inode.init(args, file) return file, nil } @@ -62,18 +63,21 @@ func (in *inode) isSymlink() bool { return ok } -// symlinkFD represents a symlink file description and implements implements +// symlinkFD represents a symlink file description and implements // vfs.FileDescriptionImpl. which may only be used if open options contains // O_PATH. For this reason most of the functions return EBADF. +// +// +stateify savable type symlinkFD struct { fileDescription + vfs.NoLockFD } // Compiles only if symlinkFD implements vfs.FileDescriptionImpl. var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release() {} +func (fd *symlinkFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go index d8b728f8c..58ef7b9b8 100644 --- a/pkg/sentry/fsimpl/ext/utils.go +++ b/pkg/sentry/fsimpl/ext/utils.go @@ -17,21 +17,21 @@ package ext import ( "io" - "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/syserror" ) // readFromDisk performs a binary read from disk into the given struct from // the absolute offset provided. -func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error { - n := binary.Size(v) +func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error { + n := v.SizeBytes() buf := make([]byte, n) if read, _ := dev.ReadAt(buf, abOff); read < int(n) { return syserror.EIO } - binary.Unmarshal(buf, binary.LittleEndian, v) + v.UnmarshalBytes(buf) return nil } diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD new file mode 100644 index 000000000..2158b1bbc --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -0,0 +1,87 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "request_list", + out = "request_list.go", + package = "fuse", + prefix = "request", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Request", + "Linker": "*Request", + }, +) + +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "fuse", + prefix = "inode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "inode", + }, +) + +go_library( + name = "fuse", + srcs = [ + "connection.go", + "connection_control.go", + "dev.go", + "directory.go", + "file.go", + "fusefs.go", + "inode_refs.go", + "read_write.go", + "register.go", + "regular_file.go", + "request_list.go", + "request_response.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/marshal", + "//pkg/refs", + "//pkg/refsvfs2", + "//pkg/safemem", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "fuse_test", + size = "small", + srcs = [ + "connection_test.go", + "dev_test.go", + "utils_test.go", + ], + library = ":fuse", + deps = [ + "//pkg/abi/linux", + "//pkg/marshal", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go new file mode 100644 index 000000000..8ccda1264 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -0,0 +1,322 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + // fuseDefaultMaxBackground is the default value for MaxBackground. + fuseDefaultMaxBackground = 12 + + // fuseDefaultCongestionThreshold is the default value for CongestionThreshold, + // and is 75% of the default maximum of MaxGround. + fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4) + + // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq. + fuseDefaultMaxPagesPerReq = 32 +) + +// connection is the struct by which the sentry communicates with the FUSE server daemon. +// +// Lock order: +// - conn.fd.mu +// - conn.mu +// - conn.asyncMu +// +// +stateify savable +type connection struct { + fd *DeviceFD + + // mu protects access to struct memebers. + mu sync.Mutex `state:"nosave"` + + // attributeVersion is the version of connection's attributes. + attributeVersion uint64 + + // We target FUSE 7.23. + // The following FUSE_INIT flags are currently unsupported by this implementation: + // - FUSE_EXPORT_SUPPORT + // - FUSE_POSIX_LOCKS: requires POSIX locks + // - FUSE_FLOCK_LOCKS: requires POSIX locks + // - FUSE_AUTO_INVAL_DATA: requires page caching eviction + // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation + // - FUSE_ASYNC_DIO + // - FUSE_PARALLEL_DIROPS (7.25) + // - FUSE_HANDLE_KILLPRIV (7.26) + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26) + // - FUSE_ABORT_ERROR (7.27) + // - FUSE_CACHE_SYMLINKS (7.28) + // - FUSE_NO_OPENDIR_SUPPORT (7.29) + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30) + // - FUSE_MAP_ALIGNMENT (7.31) + + // initialized after receiving FUSE_INIT reply. + // Until it's set, suspend sending FUSE requests. + // Use SetInitialized() and IsInitialized() for atomic access. + initialized int32 + + // initializedChan is used to block requests before initialization. + initializedChan chan struct{} `state:".(bool)"` + + // connected (connection established) when a new FUSE file system is created. + // Set to false when: + // umount, + // connection abort, + // device release. + connected bool + + // connInitError if FUSE_INIT encountered error (major version mismatch). + // Only set in INIT. + connInitError bool + + // connInitSuccess if FUSE_INIT is successful. + // Only set in INIT. + // Used for destory (not yet implemented). + connInitSuccess bool + + // aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV). + // Set only if abortErr is true and via fuse control fs (not yet implemented). + // TODO(gvisor.dev/issue/3525): set this to true when user aborts. + aborted bool + + // numWating is the number of requests waiting to be + // sent to FUSE device or being processed by FUSE daemon. + numWaiting uint32 + + // Terminology note: + // + // - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct. + // + // - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct. + // + // We call the "background" requests in unix term as async requests. + // The "async requests" in unix term is our async requests that expect a reply, + // i.e. `!request.noReply` + + // asyncMu protects the async request fields. + asyncMu sync.Mutex `state:"nosave"` + + // asyncNum is the number of async requests. + // Protected by asyncMu. + asyncNum uint16 + + // asyncCongestionThreshold the number of async requests. + // Negotiated in FUSE_INIT as "CongestionThreshold". + // TODO(gvisor.dev/issue/3529): add congestion control. + // Protected by asyncMu. + asyncCongestionThreshold uint16 + + // asyncNumMax is the maximum number of asyncNum. + // Connection blocks the async requests when it is reached. + // Negotiated in FUSE_INIT as "MaxBackground". + // Protected by asyncMu. + asyncNumMax uint16 + + // maxRead is the maximum size of a read buffer in in bytes. + // Initialized from a fuse fs parameter. + maxRead uint32 + + // maxWrite is the maximum size of a write buffer in bytes. + // Negotiated in FUSE_INIT. + maxWrite uint32 + + // maxPages is the maximum number of pages for a single request to use. + // Negotiated in FUSE_INIT. + maxPages uint16 + + // minor version of the FUSE protocol. + // Negotiated and only set in INIT. + minor uint32 + + // atomicOTrunc is true when FUSE does not send a separate SETATTR request + // before open with O_TRUNC flag. + // Negotiated and only set in INIT. + atomicOTrunc bool + + // asyncRead if read pages asynchronously. + // Negotiated and only set in INIT. + asyncRead bool + + // writebackCache is true for write-back cache policy, + // false for write-through policy. + // Negotiated and only set in INIT. + writebackCache bool + + // bigWrites if doing multi-page cached writes. + // Negotiated and only set in INIT. + bigWrites bool + + // dontMask if filestestem does not apply umask to creation modes. + // Negotiated in INIT. + dontMask bool + + // noOpen if FUSE server doesn't support open operation. + // This flag only influence performance, not correctness of the program. + noOpen bool +} + +func (conn *connection) saveInitializedChan() bool { + select { + case <-conn.initializedChan: + return true // Closed. + default: + return false // Not closed. + } +} + +func (conn *connection) loadInitializedChan(closed bool) { + conn.initializedChan = make(chan struct{}, 1) + if closed { + close(conn.initializedChan) + } +} + +// newFUSEConnection creates a FUSE connection to fd. +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) { + // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to + // mount a FUSE filesystem. + fuseFD := fd.Impl().(*DeviceFD) + + // Create the writeBuf for the header to be stored in. + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + fuseFD.writeBuf = make([]byte, hdrLen) + fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) + fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests) + fuseFD.writeCursor = 0 + + return &connection{ + fd: fuseFD, + asyncNumMax: fuseDefaultMaxBackground, + asyncCongestionThreshold: fuseDefaultCongestionThreshold, + maxRead: opts.maxRead, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, + }, nil +} + +// CallAsync makes an async (aka background) request. +// It's a simple wrapper around Call(). +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + r.async = true + _, err := conn.Call(t, r) + return err +} + +// Call makes a request to the server. +// Block before the connection is initialized. +// When the Request is FUSE_INIT, it will not be blocked before initialization. +// Task should never be nil. +// +// For a sync request, it blocks the invoking task until +// a server responds with a response. +// +// For an async request (that do not expect a response immediately), +// it returns directly unless being blocked either before initialization +// or when there are too many async requests ongoing. +// +// Example for async request: +// init, readahead, write, async read/write, fuse_notify_reply, +// non-sync release, interrupt, forget. +// +// The forget request does not have a reply, +// as documented in include/uapi/linux/fuse.h:FUSE_FORGET. +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { + // Block requests sent before connection is initalized. + if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if err := t.Block(conn.initializedChan); err != nil { + return nil, err + } + } + + if !conn.connected { + return nil, syserror.ENOTCONN + } + + if conn.connInitError { + return nil, syserror.ECONNREFUSED + } + + fut, err := conn.callFuture(t, r) + if err != nil { + return nil, err + } + + return fut.resolve(t) +} + +// callFuture makes a request to the server and returns a future response. +// Call resolve() when the response needs to be fulfilled. +func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + + // Is the queue full? + // + // We must busy wait here until the request can be queued. We don't + // block on the fd.fullQueueCh with a lock - so after being signalled, + // before we acquire the lock, it is possible that a barging task enters + // and queues a request. As a result, upon acquiring the lock we must + // again check if the room is available. + // + // This can potentially starve a request forever but this can only happen + // if there are always too many ongoing requests all the time. The + // supported maxActiveRequests setting should be really high to avoid this. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + log.Infof("Blocking request %v from being queued. Too many active requests: %v", + r.id, conn.fd.numActiveRequests) + conn.fd.mu.Unlock() + err := t.Block(conn.fd.fullQueueCh) + conn.fd.mu.Lock() + if err != nil { + return nil, err + } + } + + return conn.callFutureLocked(t, r) +} + +// callFutureLocked makes a request to the server and returns a future response. +func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + // Check connected again holding conn.mu. + conn.mu.Lock() + if !conn.connected { + conn.mu.Unlock() + // we checked connected before, + // this must be due to aborted connection. + return nil, syserror.ECONNABORTED + } + conn.mu.Unlock() + + conn.fd.queue.PushBack(r) + conn.fd.numActiveRequests++ + fut := newFutureResponse(r) + conn.fd.completions[r.id] = fut + + // Signal the readers that there is something to read. + conn.fd.waitQueue.Notify(waiter.EventIn) + + return fut, nil +} diff --git a/pkg/sentry/fsimpl/fuse/connection_control.go b/pkg/sentry/fsimpl/fuse/connection_control.go new file mode 100644 index 000000000..bfde78559 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection_control.go @@ -0,0 +1,247 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// consts used by FUSE_INIT negotiation. +const ( + // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut. + // Follow the same behavior as unix fuse implementation. + fuseMaxMaxPages = 256 + + // Maximum value for the time granularity for file time stamps, 1s. + // Follow the same behavior as unix fuse implementation. + fuseMaxTimeGranNs = 1000000000 + + // Minimum value for MaxWrite and MaxRead. + // Follow the same behavior as unix fuse implementation. + fuseMinMaxWrite = 4096 + fuseMinMaxRead = 4096 + + // Temporary default value for max readahead, 128kb. + fuseDefaultMaxReadahead = 131072 + + // The FUSE_INIT_IN flags sent to the daemon. + // TODO(gvisor.dev/issue/3199): complete the flags. + fuseDefaultInitFlags = linux.FUSE_MAX_PAGES +) + +// Adjustable maximums for Connection's cogestion control parameters. +// Used as the upperbound of the config values. +// Currently we do not support adjustment to them. +var ( + MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground + MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold +) + +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + +// InitSend sends a FUSE_INIT request. +func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { + in := linux.FUSEInitIn{ + Major: linux.FUSE_KERNEL_VERSION, + Minor: linux.FUSE_KERNEL_MINOR_VERSION, + // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this + MaxReadahead: fuseDefaultMaxReadahead, + Flags: fuseDefaultInitFlags, + } + + req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) + if err != nil { + return err + } + + // Since there is no task to block on and FUSE_INIT is the request + // to unblock other requests, use nil. + return conn.CallAsync(nil, req) +} + +// InitRecv receives a FUSE_INIT reply and process it. +// +// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13. +func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { + if err := res.Error(); err != nil { + return err + } + + initRes := fuseInitRes{initLen: res.DataLen()} + if err := res.UnmarshalPayload(&initRes); err != nil { + return err + } + + return conn.initProcessReply(&initRes.initOut, hasSysAdminCap) +} + +// Process the FUSE_INIT reply from the FUSE server. +// It tries to acquire the conn.asyncMu lock if minor version is newer than 13. +func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No matter error or not, always set initialzied. + // to unblock the blocked requests. + defer conn.SetInitialized() + + // No support for old major fuse versions. + if out.Major != linux.FUSE_KERNEL_VERSION { + conn.connInitError = true + return nil + } + + // Start processing the reply. + conn.connInitSuccess = true + conn.minor = out.Minor + + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite + } + + // No support for the following flags before minor version 6. + if out.Minor >= 6 { + conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0 + conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 + conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 + conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 + + // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). + + if out.Flags&linux.FUSE_MAX_PAGES != 0 { + maxPages := out.MaxPages + if maxPages < 1 { + maxPages = 1 + } + if maxPages > fuseMaxMaxPages { + maxPages = fuseMaxMaxPages + } + conn.maxPages = maxPages + } + } + + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.asyncMu.Lock() + + if out.MaxBackground > 0 { + conn.asyncNumMax = out.MaxBackground + + if !hasSysAdminCap && + conn.asyncNumMax > MaxUserBackgroundRequest { + conn.asyncNumMax = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.asyncCongestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.asyncCongestionThreshold > MaxUserCongestionThreshold { + conn.asyncCongestionThreshold = MaxUserCongestionThreshold + } + } + + conn.asyncMu.Unlock() + } + + return nil +} + +// Abort this FUSE connection. +// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order. +// All possible requests waiting or blocking will be aborted. +// +// Preconditions: conn.fd.mu is locked. +func (conn *connection) Abort(ctx context.Context) { + conn.mu.Lock() + conn.asyncMu.Lock() + + if !conn.connected { + conn.asyncMu.Unlock() + conn.mu.Unlock() + conn.fd.mu.Unlock() + return + } + + conn.connected = false + + // Empty the `fd.queue` that holds the requests + // not yet read by the FUSE daemon yet. + // These are a subset of the requests in `fuse.completion` map. + for !conn.fd.queue.Empty() { + req := conn.fd.queue.Front() + conn.fd.queue.Remove(req) + } + + var terminate []linux.FUSEOpID + + // 2. Collect the requests have not been sent to FUSE daemon, + // or have not received a reply. + for unique := range conn.fd.completions { + terminate = append(terminate, unique) + } + + // Release locks to avoid deadlock. + conn.asyncMu.Unlock() + conn.mu.Unlock() + + // 1. The requets blocked before initialization. + // Will reach call() `connected` check and return. + if !conn.Initialized() { + conn.SetInitialized() + } + + // 2. Terminate the requests collected above. + // Set ECONNABORTED error. + // sendError() will remove them from `fd.completion` map. + // Will enter the path of a normally received error. + for _, toTerminate := range terminate { + conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate) + } + + // 3. The requests not yet written to FUSE device. + // Early terminate. + // Will reach callFutureLocked() `connected` check and return. + close(conn.fd.fullQueueCh) + + // TODO(gvisor.dev/issue/3528): Forget all pending forget reqs. +} diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go new file mode 100644 index 000000000..91d16c1cf --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection_test.go @@ -0,0 +1,117 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "math/rand" + "syscall" + "testing" + + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TestConnectionInitBlock tests if initialization +// correctly blocks and unblocks the connection. +// Since it's unfeasible to test kernelTask.Block() in unit test, +// the code in Call() are not tested here. +func TestConnectionInitBlock(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + + conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + select { + case <-conn.initializedChan: + t.Fatalf("initializedChan should be blocking before SetInitialized") + default: + } + + conn.SetInitialized() + + select { + case <-conn.initializedChan: + default: + t.Fatalf("initializedChan should not be blocking after SetInitialized") + } +} + +func TestConnectionAbort(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + creds := auth.CredentialsFromContext(s.Ctx) + task := kernel.TaskFromContext(s.Ctx) + + const numRequests uint64 = 256 + + conn, _, err := newTestConnection(s, k, numRequests) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + testObj := &testPayload{ + data: rand.Uint32(), + } + + var futNormal []*futureResponse + + for i := 0; i < int(numRequests); i++ { + req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + fut, err := conn.callFutureLocked(task, req) + if err != nil { + t.Fatalf("callFutureLocked failed: %v", err) + } + futNormal = append(futNormal, fut) + } + + conn.Abort(s.Ctx) + + // Abort should unblock the initialization channel. + // Note: no test requests are actually blocked on `conn.initializedChan`. + select { + case <-conn.initializedChan: + default: + t.Fatalf("initializedChan should not be blocking after SetInitialized") + } + + // Abort will return ECONNABORTED error to unblocked requests. + for _, fut := range futNormal { + if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) { + t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error) + } + } + + // After abort, Call() should return directly with ENOTCONN. + req, err := conn.NewRequest(creds, 0, 0, 0, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + _, err = conn.Call(task, req) + if err != syserror.ENOTCONN { + t.Fatalf("Incorrect error code received for Call() after connection aborted") + } + +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go new file mode 100644 index 000000000..1b86a4b4c --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -0,0 +1,463 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const fuseDevMinor = 229 + +// fuseDevice implements vfs.Device for /dev/fuse. +// +// +stateify savable +type fuseDevice struct{} + +// Open implements vfs.Device.Open. +func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if !kernel.FUSEEnabled { + return nil, syserror.ENOENT + } + + var fd DeviceFD + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. +// +// +stateify savable +type DeviceFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // nextOpID is used to create new requests. + nextOpID linux.FUSEOpID + + // queue is the list of requests that need to be processed by the FUSE server. + queue requestList + + // numActiveRequests is the number of requests made by the Sentry that has + // yet to be responded to. + numActiveRequests uint64 + + // completions is used to map a request to its response. A Writer will use this + // to notify the caller of a completed response. + completions map[linux.FUSEOpID]*futureResponse + + writeCursor uint32 + + // writeBuf is the memory buffer used to copy in the FUSE out header from + // userspace. + writeBuf []byte + + // writeCursorFR current FR being copied from server. + writeCursorFR *futureResponse + + // mu protects all the queues, maps, buffers and cursors and nextOpID. + mu sync.Mutex `state:"nosave"` + + // waitQueue is used to notify interested parties when the device becomes + // readable or writable. + waitQueue waiter.Queue + + // fullQueueCh is a channel used to synchronize the readers with the writers. + // Writers (inbound requests to the filesystem) block if there are too many + // unprocessed in-flight requests. + fullQueueCh chan struct{} `state:".(int)"` + + // fs is the FUSE filesystem that this FD is being used for. + fs *filesystem +} + +func (fd *DeviceFD) saveFullQueueCh() int { + return cap(fd.fullQueueCh) +} + +func (fd *DeviceFD) loadFullQueueCh(capacity int) { + fd.fullQueueCh = make(chan struct{}, capacity) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DeviceFD) Release(ctx context.Context) { + if fd.fs != nil { + fd.fs.conn.mu.Lock() + fd.fs.conn.connected = false + fd.fs.conn.mu.Unlock() + + fd.fs.VFSFilesystem().DecRef(ctx) + fd.fs = nil + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if fd.fs == nil { + return 0, syserror.EPERM + } + + return 0, syserror.ENOSYS +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if fd.fs == nil { + return 0, syserror.EPERM + } + + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + // TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs. + return 0, syserror.ENODEV + } + + // We require that any Read done on this filesystem have a sane minimum + // read buffer. It must have the capacity for the fixed parts of any request + // header (Linux uses the request header and the FUSEWriteIn header for this + // calculation) + the negotiated MaxWrite room for the data. + minBuffSize := linux.FUSE_MIN_READ_BUFFER + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite + if minBuffSize < negotiatedMinBuffSize { + minBuffSize = negotiatedMinBuffSize + } + + // If the read buffer is too small, error out. + if dst.NumBytes() < int64(minBuffSize) { + return 0, syserror.EINVAL + } + + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readLocked(ctx, dst, opts) +} + +// readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +// +// Preconditions: dst is large enough for any reasonable request. +func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + var req *Request + + // Find the first valid request. + // For the normal case this loop only execute once. + for !fd.queue.Empty() { + req = fd.queue.Front() + + if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() { + break + } + + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } + + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { + return 0, err + } + + // We're done with this request. + fd.queue.Remove(req) + req = nil + } + + if req == nil { + return 0, syserror.ErrWouldBlock + } + + // We already checked the size: dst must be able to fit the whole request. + // Now we write the marshalled header, the payload, + // and the potential additional payload + // to the user memory IOSequence. + + n, err := dst.CopyOut(ctx, req.data) + if err != nil { + return 0, err + } + if n != len(req.data) { + return 0, syserror.EIO + } + + if req.hdr.Opcode == linux.FUSE_WRITE { + written, err := dst.DropFirst(n).CopyOut(ctx, req.payload) + if err != nil { + return 0, err + } + if written != len(req.payload) { + return 0, syserror.EIO + } + n += int(written) + } + + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + + // Remove noReply ones from map of requests expecting a reply. + if req.noReply { + fd.numActiveRequests -= 1 + delete(fd.completions, req.hdr.Unique) + } + + return int64(n), nil +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if fd.fs == nil { + return 0, syserror.EPERM + } + + return 0, syserror.ENOSYS +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.writeLocked(ctx, src, opts) +} + +// writeLocked implements writing to the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if fd.fs == nil { + return 0, syserror.EPERM + } + + // Return ENODEV if the filesystem is umounted. + if fd.fs.umounted { + return 0, syserror.ENODEV + } + + var cn, n int64 + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + + for src.NumBytes() > 0 { + if fd.writeCursorFR != nil { + // Already have common header, and we're now copying the payload. + wantBytes := fd.writeCursorFR.hdr.Len + + // Note that the FR data doesn't have the header. Copy it over if its necessary. + if fd.writeCursorFR.data == nil { + fd.writeCursorFR.data = make([]byte, wantBytes) + } + + bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == wantBytes { + // Done reading this full response. Clean up and unblock the + // initiator. + break + } + + // Check if we have more data in src. + continue + } + + // Assert that the header isn't read into the writeBuf yet. + if fd.writeCursor >= hdrLen { + return 0, syserror.EINVAL + } + + // We don't have the full common response header yet. + wantBytes := hdrLen - fd.writeCursor + bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == hdrLen { + // Have full header in the writeBuf. Use it to fetch the actual futureResponse + // from the device's completions map. + var hdr linux.FUSEHeaderOut + hdr.UnmarshalBytes(fd.writeBuf) + + // We have the header now and so the writeBuf has served its purpose. + // We could reset it manually here but instead of doing that, at the + // end of the write, the writeCursor will be set to 0 thereby allowing + // the next request to overwrite whats in the buffer, + + fut, ok := fd.completions[hdr.Unique] + if !ok { + // Server sent us a response for a request we never sent, + // or for which we already received a reply (e.g. aborted), an unlikely event. + return 0, syserror.EINVAL + } + + delete(fd.completions, hdr.Unique) + + // Copy over the header into the future response. The rest of the payload + // will be copied over to the FR's data in the next iteration. + fut.hdr = &hdr + fd.writeCursorFR = fut + + // Next iteration will now try read the complete request, if src has + // any data remaining. Otherwise we're done. + } + } + + if fd.writeCursorFR != nil { + if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil { + return 0, err + } + + // Ready the device for the next request. + fd.writeCursorFR = nil + fd.writeCursor = 0 + } + + return n, nil +} + +// Readiness implements vfs.FileDescriptionImpl.Readiness. +func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readinessLocked(mask) +} + +// readinessLocked implements checking the readiness of the fuse device while +// locked with DeviceFD.mu. +func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + + if fd.fs.umounted { + ready |= waiter.EventErr + return ready & mask + } + + // FD is always writable. + ready |= waiter.EventOut + if !fd.queue.Empty() { + // Have reqs available, FD is readable. + ready |= waiter.EventIn + } + + return ready & mask +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.waitQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { + fd.waitQueue.EventUnregister(e) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if fd.fs == nil { + return 0, syserror.EPERM + } + + return 0, syserror.ENOSYS +} + +// sendResponse sends a response to the waiting task (if any). +// +// Preconditions: fd.mu must be held. +func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { + // Signal the task waiting on a response if any. + defer close(fut.ch) + + // Signal that the queue is no longer full. + select { + case fd.fullQueueCh <- struct{}{}: + default: + } + fd.numActiveRequests-- + + if fut.async { + return fd.asyncCallBack(ctx, fut.getResponse()) + } + + return nil +} + +// sendError sends an error response to the waiting task (if any) by calling sendResponse(). +// +// Preconditions: fd.mu must be held. +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { + // Return the error to the calling task. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + respHdr := linux.FUSEHeaderOut{ + Len: outHdrLen, + Error: errno, + Unique: unique, + } + + fut, ok := fd.completions[respHdr.Unique] + if !ok { + // A response for a request we never sent, + // or for which we already received a reply (e.g. aborted). + return syserror.EINVAL + } + delete(fd.completions, respHdr.Unique) + + fut.hdr = &respHdr + return fd.sendResponse(ctx, fut) +} + +// asyncCallBack executes pre-defined callback function for async requests. +// Currently used by: FUSE_INIT. +func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { + switch r.opcode { + case linux.FUSE_INIT: + creds := auth.CredentialsFromContext(ctx) + rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() + return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) + // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go new file mode 100644 index 000000000..5986133e9 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -0,0 +1,323 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "math/rand" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// echoTestOpcode is the Opcode used during testing. The server used in tests +// will simply echo the payload back with the appropriate headers. +const echoTestOpcode linux.FUSEOpcode = 1000 + +// TestFUSECommunication tests that the communication layer between the Sentry and the +// FUSE server daemon works as expected. +func TestFUSECommunication(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + creds := auth.CredentialsFromContext(s.Ctx) + + // Create test cases with different number of concurrent clients and servers. + testCases := []struct { + Name string + NumClients int + NumServers int + MaxActiveRequests uint64 + }{ + { + Name: "SingleClientSingleServer", + NumClients: 1, + NumServers: 1, + MaxActiveRequests: maxActiveRequestsDefault, + }, + { + Name: "SingleClientMultipleServers", + NumClients: 1, + NumServers: 10, + MaxActiveRequests: maxActiveRequestsDefault, + }, + { + Name: "MultipleClientsSingleServer", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: maxActiveRequestsDefault, + }, + { + Name: "MultipleClientsMultipleServers", + NumClients: 10, + NumServers: 10, + MaxActiveRequests: maxActiveRequestsDefault, + }, + { + Name: "RequestCapacityFull", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: 1, + }, + { + Name: "RequestCapacityContinuouslyFull", + NumClients: 100, + NumServers: 2, + MaxActiveRequests: 2, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + clientsDone := make([]chan struct{}, testCase.NumClients) + serversDone := make([]chan struct{}, testCase.NumServers) + serversKill := make([]chan struct{}, testCase.NumServers) + + // FUSE clients. + for i := 0; i < testCase.NumClients; i++ { + clientsDone[i] = make(chan struct{}) + go func(i int) { + fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i]) + }(i) + } + + // FUSE servers. + for j := 0; j < testCase.NumServers; j++ { + serversDone[j] = make(chan struct{}) + serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block. + go func(j int) { + fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j]) + }(j) + } + + // Tear down. + // + // Make sure all the clients are done. + for i := 0; i < testCase.NumClients; i++ { + <-clientsDone[i] + } + + // Kill any server that is potentially waiting. + for j := 0; j < testCase.NumServers; j++ { + serversKill[j] <- struct{}{} + } + + // Make sure all the servers are done. + for j := 0; j < testCase.NumServers; j++ { + <-serversDone[j] + } + }) + } +} + +// CallTest makes a request to the server and blocks the invoking +// goroutine until a server responds with a response. Doesn't block +// a kernel.Task. Analogous to Connection.Call but used for testing. +func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { + conn.fd.mu.Lock() + + // Wait until we're certain that a new request can be processed. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + conn.fd.mu.Unlock() + select { + case <-conn.fd.fullQueueCh: + } + conn.fd.mu.Lock() + } + + fut, err := conn.callFutureLocked(t, r) // No task given. + conn.fd.mu.Unlock() + + if err != nil { + return nil, err + } + + // Resolve the response. + // + // Block without a task. + select { + case <-fut.ch: + } + + // A response is ready. Resolve and return it. + return fut.getResponse(), nil +} + +// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE +// device. However, it does so by - not blocking the task that is calling - and +// instead just waits on a channel. The behaviour is essentially the same as +// DeviceFD.Read except it guarantees that the task is not blocked. +func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) { + var err error + var n, total int64 + + dev := fd.Impl().(*DeviceFD) + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + dev.EventRegister(&w, waiter.EventIn) + for { + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + // Emulate the blocking for when no requests are available + select { + case <-ch: + case <-killServer: + // Server killed by the main program. + return 0, true, nil + } + } + + dev.EventUnregister(&w) + return total, false, err +} + +// fuseClientRun emulates all the actions of a normal FUSE request. It creates +// a header, a payload, calls the server, waits for the response, and processes +// the response. +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { + defer func() { clientDone <- struct{}{} }() + + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + testObj := &testPayload{ + data: rand.Uint32(), + } + + req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + + // Queue up a request. + // Analogous to Call except it doesn't block on the task. + resp, err := CallTest(conn, clientTask, req, pid) + if err != nil { + t.Fatalf("CallTaskNonBlock failed: %v", err) + } + + if err = resp.Error(); err != nil { + t.Fatalf("Server responded with an error: %v", err) + } + + var respTestPayload testPayload + if err := resp.UnmarshalPayload(&respTestPayload); err != nil { + t.Fatalf("Unmarshalling payload error: %v", err) + } + + if resp.hdr.Unique != req.hdr.Unique { + t.Fatalf("got response for another request. Expected response for req %v but got response for req %v", + req.hdr.Unique, resp.hdr.Unique) + } + + if respTestPayload.data != testObj.data { + t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data) + } + +} + +// fuseServerRun creates a task and emulates all the actions of a simple FUSE server +// that simply reads a request and echos the same struct back as a response using the +// appropriate headers. +func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) { + defer func() { serverDone <- struct{}{} }() + + // Create the tasks that the server will be using. + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + var readPayload testPayload + + serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + + // Read the request. + for { + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + payloadLen := uint32(readPayload.SizeBytes()) + + // The raed buffer must meet some certain size criteria. + buffSize := inHdrLen + payloadLen + if buffSize < linux.FUSE_MIN_READ_BUFFER { + buffSize = linux.FUSE_MIN_READ_BUFFER + } + inBuf := make([]byte, buffSize) + inIOseq := usermem.BytesIOSequence(inBuf) + + n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer) + if err != nil { + t.Fatalf("Read failed :%v", err) + } + + // Server should shut down. No new requests are going to be made. + if serverKilled { + break + } + + if n <= 0 { + t.Fatalf("Read read no bytes") + } + + var readFUSEHeaderIn linux.FUSEHeaderIn + readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen]) + readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen]) + + if readFUSEHeaderIn.Opcode != echoTestOpcode { + t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload) + } + + // Write the response. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + outBuf := make([]byte, outHdrLen+payloadLen) + outHeader := linux.FUSEHeaderOut{ + Len: outHdrLen + payloadLen, + Error: 0, + Unique: readFUSEHeaderIn.Unique, + } + + // Echo the payload back. + outHeader.MarshalUnsafe(outBuf[:outHdrLen]) + readPayload.MarshalUnsafe(outBuf[outHdrLen:]) + outIOseq := usermem.BytesIOSequence(outBuf) + + n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed :%v", err) + } + } +} diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go new file mode 100644 index 000000000..8f220a04b --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -0,0 +1,105 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type directoryFD struct { + fileDescription +} + +// Allocate implements directoryFD.Allocate. +func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error { + fusefs := dir.inode().fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSEReadIn{ + Fh: dir.Fh, + Offset: uint64(atomic.LoadInt64(&dir.off)), + Size: linux.FUSE_PAGE_SIZE, + Flags: dir.statusFlags(), + } + + // TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS. + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in) + if err != nil { + return err + } + + res, err := fusefs.conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEDirents + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + for _, fuseDirent := range out.Dirents { + nextOff := int64(fuseDirent.Meta.Off) + dirent := vfs.Dirent{ + Name: fuseDirent.Name, + Type: uint8(fuseDirent.Meta.Type), + Ino: fuseDirent.Meta.Ino, + NextOff: nextOff, + } + + if err := callback.Handle(dirent); err != nil { + return err + } + atomic.StoreInt64(&dir.off, nextOff) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go new file mode 100644 index 000000000..83f2816b7 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/file.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fileDescription implements vfs.FileDescriptionImpl for fuse. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // the file handle used in userspace. + Fh uint64 + + // Nonseekable is indicate cannot perform seek on a file. + Nonseekable bool + + // DirectIO suggest fuse to use direct io operation. + DirectIO bool + + // OpenFlag is the flag returned by open. + OpenFlag uint32 + + // off is the file offset. + off int64 +} + +func (fd *fileDescription) dentry() *kernfs.Dentry { + return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry) +} + +func (fd *fileDescription) inode() *inode { + return fd.dentry().Inode().(*inode) +} + +func (fd *fileDescription) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *fileDescription) statusFlags() uint32 { + return fd.vfsfd.StatusFlags() +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + // no need to release if FUSE server doesn't implement Open. + conn := fd.inode().fs.conn + if conn.noOpen { + return + } + + in := linux.FUSEReleaseIn{ + Fh: fd.Fh, + Flags: fd.statusFlags(), + } + // TODO(gvisor.dev/issue/3245): add logic when we support file lock owner. + var opcode linux.FUSEOpcode + if fd.inode().Mode().IsDir() { + opcode = linux.FUSE_RELEASEDIR + } else { + opcode = linux.FUSE_RELEASE + } + kernelTask := kernel.TaskFromContext(ctx) + // ignoring errors and FUSE server reply is analogous to Linux's behavior. + req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in) + if err != nil { + // No way to invoke Call() with an errored request. + return + } + // The reply will be ignored since no callback is defined in asyncCallBack(). + conn.CallAsync(kernelTask, req) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, nil +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh) +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go new file mode 100644 index 000000000..e7ef5998e --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -0,0 +1,822 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fuse implements fusefs. +package fuse + +import ( + "math" + "strconv" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Name is the default filesystem name. +const Name = "fuse" + +// maxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const maxActiveRequestsDefault = 10000 + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// +stateify savable +type filesystemOptions struct { + // userID specifies the numeric uid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + userID uint32 + + // groupID specifies the numeric gid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + groupID uint32 + + // rootMode specifies the the file mode of the filesystem's root. + rootMode linux.FileMode + + // maxActiveRequests specifies the maximum number of active requests that can + // exist at any time. Any further requests will block when trying to + // Call the server. + maxActiveRequests uint64 + + // maxRead is the max number of bytes to read, + // specified as "max_read" in fs parameters. + // If not specified by user, use math.MaxUint32 as default value. + maxRead uint32 +} + +// filesystem implements vfs.FilesystemImpl. +// +// +stateify savable +type filesystem struct { + kernfs.Filesystem + devMinor uint32 + + // conn is used for communication between the FUSE server + // daemon and the sentry fusefs. + conn *connection + + // opts is the options the fusefs is initialized with. + opts *filesystemOptions + + // umounted is true if filesystem.Release() has been called. + umounted bool +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + var fsopts filesystemOptions + mopts := vfs.GenericParseMountOptions(opts.Data) + deviceDescriptorStr, ok := mopts["fd"] + if !ok { + log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name()) + return nil, nil, syserror.EINVAL + } + delete(mopts, "fd") + + deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) + if err != nil { + return nil, nil, err + } + + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) + return nil, nil, syserror.EINVAL + } + fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) + + // Parse and set all the other supported FUSE mount options. + // TODO(gVisor.dev/issue/3229): Expand the supported mount options. + if userIDStr, ok := mopts["user_id"]; ok { + delete(mopts, "user_id") + userID, err := strconv.ParseUint(userIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.userID = uint32(userID) + } + + if groupIDStr, ok := mopts["group_id"]; ok { + delete(mopts, "group_id") + groupID, err := strconv.ParseUint(groupIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.groupID = uint32(groupID) + } + + rootMode := linux.FileMode(0777) + modeStr, ok := mopts["rootmode"] + if ok { + delete(mopts, "rootmode") + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) + return nil, nil, syserror.EINVAL + } + rootMode = linux.FileMode(mode) + } + fsopts.rootMode = rootMode + + // Set the maxInFlightRequests option. + fsopts.maxActiveRequests = maxActiveRequestsDefault + + if maxReadStr, ok := mopts["max_read"]; ok { + delete(mopts, "max_read") + maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) + return nil, nil, syserror.EINVAL + } + if maxRead < fuseMinMaxRead { + maxRead = fuseMinMaxRead + } + fsopts.maxRead = uint32(maxRead) + } else { + fsopts.maxRead = math.MaxUint32 + } + + // Check for unparsed options. + if len(mopts) != 0 { + log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) + return nil, nil, syserror.EINVAL + } + + // Create a new FUSE filesystem. + fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + if err != nil { + log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } + + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) + + // Send a FUSE_INIT request to the FUSE daemon server before returning. + // This call is not blocking. + if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { + log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } + + // root is the fusefs root directory. + root := fs.newRootInode(ctx, creds, fsopts.rootMode) + + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// newFUSEFilesystem creates a new FUSE filesystem. +func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + conn, err := newFUSEConnection(ctx, device, opts) + if err != nil { + log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) + return nil, syserror.EINVAL + } + + fuseFD := device.Impl().(*DeviceFD) + + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + conn: conn, + } + + fs.VFSFilesystem().IncRef() + fuseFD.fs = fs + + return fs, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + fs.conn.fd.mu.Lock() + + fs.umounted = true + fs.conn.Abort(ctx) + // Notify all the waiters on this fd. + fs.conn.fd.waitQueue.Notify(waiter.EventIn) + + fs.conn.fd.mu.Unlock() + + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release(ctx) +} + +// inode implements kernfs.Inode. +// +// +stateify savable +type inode struct { + inodeRefs + kernfs.InodeAlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.OrderedChildren + + // the owning filesystem. fs is immutable. + fs *filesystem + + // metaDataMu protects the metadata of this inode. + metadataMu sync.Mutex + + nodeID uint64 + + locks vfs.FileLocks + + // size of the file. + size uint64 + + // attributeVersion is the version of inode's attributes. + attributeVersion uint64 + + // attributeTime is the remaining vaild time of attributes. + attributeTime uint64 + + // version of the inode. + version uint64 + + // link is result of following a symbolic link. + link string +} + +func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &inode{fs: fs, nodeID: 1} + i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() + + var d kernfs.Dentry + d.Init(&fs.Filesystem, i) + return &d +} + +func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FUSEAttr) kernfs.Inode { + i := &inode{fs: fs, nodeID: nodeID} + creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)} + i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) + atomic.StoreUint64(&i.size, attr.Size) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() + return i +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + isDir := i.InodeAttrs.Mode().IsDir() + // return error if specified to open directory but inode is not a directory. + if !isDir && opts.Mode.IsDir() { + return nil, syserror.ENOTDIR + } + if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { + return nil, syserror.EOVERFLOW + } + + var fd *fileDescription + var fdImpl vfs.FileDescriptionImpl + if isDir { + directoryFD := &directoryFD{} + fd = &(directoryFD.fileDescription) + fdImpl = directoryFD + } else { + regularFD := ®ularFileFD{} + fd = &(regularFD.fileDescription) + fdImpl = regularFD + } + // FOPEN_KEEP_CACHE is the defualt flag for noOpen. + fd.OpenFlag = linux.FOPEN_KEEP_CACHE + + // Only send open request when FUSE server support open or is opening a directory. + if !i.fs.conn.noOpen || isDir { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") + return nil, syserror.EINVAL + } + + // Build the request. + var opcode linux.FUSEOpcode + if isDir { + opcode = linux.FUSE_OPENDIR + } else { + opcode = linux.FUSE_OPEN + } + + in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} + if !i.fs.conn.atomicOTrunc { + in.Flags &= ^uint32(linux.O_TRUNC) + } + + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) + if err != nil { + return nil, err + } + + // Send the request and receive the reply. + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return nil, err + } + if err := res.Error(); err == syserror.ENOSYS && !isDir { + i.fs.conn.noOpen = true + } else if err != nil { + return nil, err + } else { + out := linux.FUSEOpenOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + + // Process the reply. + fd.OpenFlag = out.OpenFlag + if isDir { + fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) + } + + fd.Fh = out.Fh + } + } + + // TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode + fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 + fdOptions := &vfs.FileDescriptionOptions{} + if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { + fdOptions.DenyPRead = true + fdOptions.DenyPWrite = true + fd.Nonseekable = true + } + + // If we don't send SETATTR before open (which is indicated by atomicOTrunc) + // and O_TRUNC is set, update the inode's version number and clean existing data + // by setting the file size to 0. + if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 { + i.fs.conn.mu.Lock() + i.fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, 0) + i.fs.conn.mu.Unlock() + i.attributeTime = 0 + } + + if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + in := linux.FUSELookupIn{Name: name} + return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) +} + +// Keep implements kernfs.Inode.Keep. +func (i *inode) Keep() bool { + // Return true so that kernfs keeps the new dentry pointing to this + // inode in the dentry tree. This is needed because inodes created via + // Lookup are not temporary. They might refer to existing files on server + // that can be Unlink'd/Rmdir'd. + return true +} + +// IterDirents implements kernfs.Inode.IterDirents. +func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + +// NewFile implements kernfs.Inode.NewFile. +func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + in := linux.FUSECreateIn{ + CreateMeta: linux.FUSECreateMeta{ + Flags: opts.Flags, + Mode: uint32(opts.Mode) | linux.S_IFREG, + Umask: uint32(kernelTask.FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) +} + +// NewNode implements kernfs.Inode.NewNode. +func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) { + in := linux.FUSEMknodIn{ + MknodMeta: linux.FUSEMknodMeta{ + Mode: uint32(opts.Mode), + Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) +} + +// NewSymlink implements kernfs.Inode.NewSymlink. +func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) { + in := linux.FUSESymLinkIn{ + Name: name, + Target: target, + } + return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) +} + +// Unlink implements kernfs.Inode.Unlink. +func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return syserror.EINVAL + } + in := linux.FUSEUnlinkIn{Name: name} + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) + if err != nil { + return err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return err + } + // only return error, discard res. + return res.Error() +} + +// NewDir implements kernfs.Inode.NewDir. +func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { + in := linux.FUSEMkdirIn{ + MkdirMeta: linux.FUSEMkdirMeta{ + Mode: uint32(opts.Mode), + Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), + }, + Name: name, + } + return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) +} + +// RmDir implements kernfs.Inode.RmDir. +func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error { + fusefs := i.fs + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + + in := linux.FUSERmDirIn{Name: name} + req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) + if err != nil { + return err + } + + res, err := i.fs.conn.Call(task, req) + if err != nil { + return err + } + return res.Error() +} + +// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response. +// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. +func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID) + return nil, syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) + if err != nil { + return nil, err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return nil, err + } + if err := res.Error(); err != nil { + return nil, err + } + out := linux.FUSEEntryOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return nil, err + } + if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { + return nil, syserror.EIO + } + child := i.fs.newInode(ctx, out.NodeID, out.Attr) + return child, nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + path, err := i.Readlink(ctx, mnt) + return vfs.VirtualDentry{}, path, err +} + +// Readlink implements kernfs.Inode.Readlink. +func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if i.Mode().FileType()&linux.S_IFLNK == 0 { + return "", syserror.EINVAL + } + if len(i.link) == 0 { + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") + return "", syserror.EINVAL + } + req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) + if err != nil { + return "", err + } + res, err := i.fs.conn.Call(kernelTask, req) + if err != nil { + return "", err + } + i.link = string(res.data[res.hdr.SizeBytes():]) + if !mnt.Options().ReadOnly { + i.attributeTime = 0 + } + } + return i.link, nil +} + +// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache. +// TODO(gvisor.dev/issue/3679): Add support for other fields. +func (i *inode) getFUSEAttr() linux.FUSEAttr { + return linux.FUSEAttr{ + Ino: i.Ino(), + Size: atomic.LoadUint64(&i.size), + Mode: uint32(i.Mode()), + } +} + +// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The +// opts.Sync attribute is ignored since the synchronization is handled by the +// FUSE server. +func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { + var stat linux.Statx + stat.Blksize = attr.BlkSize + stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor + + rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev) + stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor + + if mask&linux.STATX_MODE != 0 { + stat.Mode = uint16(attr.Mode) + } + if mask&linux.STATX_NLINK != 0 { + stat.Nlink = attr.Nlink + } + if mask&linux.STATX_UID != 0 { + stat.UID = attr.UID + } + if mask&linux.STATX_GID != 0 { + stat.GID = attr.GID + } + if mask&linux.STATX_ATIME != 0 { + stat.Atime = linux.StatxTimestamp{ + Sec: int64(attr.Atime), + Nsec: attr.AtimeNsec, + } + } + if mask&linux.STATX_MTIME != 0 { + stat.Mtime = linux.StatxTimestamp{ + Sec: int64(attr.Mtime), + Nsec: attr.MtimeNsec, + } + } + if mask&linux.STATX_CTIME != 0 { + stat.Ctime = linux.StatxTimestamp{ + Sec: int64(attr.Ctime), + Nsec: attr.CtimeNsec, + } + } + if mask&linux.STATX_INO != 0 { + stat.Ino = attr.Ino + } + if mask&linux.STATX_SIZE != 0 { + stat.Size = attr.Size + } + if mask&linux.STATX_BLOCKS != 0 { + stat.Blocks = attr.Blocks + } + return stat +} + +// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request +// or read from local cache. It updates the corresponding attributes if +// necessary. +func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { + attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion) + + // TODO(gvisor.dev/issue/3679): send the request only if + // - invalid local cache for fields specified in the opts.Mask + // - forced update + // - i.attributeTime expired + // If local cache is still valid, return local cache. + // Currently we always send a request, + // and we always set the metadata with the new result, + // unless attributeVersion has changed. + + task := kernel.TaskFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return linux.FUSEAttr{}, syserror.EINVAL + } + + creds := auth.CredentialsFromContext(ctx) + + in := linux.FUSEGetAttrIn{ + GetAttrFlags: flags, + Fh: fh, + } + req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) + if err != nil { + return linux.FUSEAttr{}, err + } + + res, err := i.fs.conn.Call(task, req) + if err != nil { + return linux.FUSEAttr{}, err + } + if err := res.Error(); err != nil { + return linux.FUSEAttr{}, err + } + + var out linux.FUSEGetAttrOut + if err := res.UnmarshalPayload(&out); err != nil { + return linux.FUSEAttr{}, err + } + + // Local version is newer, return the local one. + // Skip the update. + if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { + return i.getFUSEAttr(), nil + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + }); err != nil { + return linux.FUSEAttr{}, err + } + + // Set the size if no error (after SetStat() check). + atomic.StoreUint64(&i.size, out.Attr.Size) + + return out.Attr, nil +} + +// reviseAttr attempts to update the attributes for internal purposes +// by calling getAttr with a pre-specified mask. +// Used by read, write, lseek. +func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { + // Never need atime for internal purposes. + _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, + }, flags, fh) + return err +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + attr, err := i.getAttr(ctx, fs, opts, 0, 0) + if err != nil { + return linux.Statx{}, err + } + + return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil +} + +// DecRef implements kernfs.Inode.DecRef. +func (i *inode) DecRef(ctx context.Context) { + i.inodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. + return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil +} + +// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask +// aligned with the attribute mask defined in include/linux/fs.h. +func fattrMaskFromStats(mask uint32) uint32 { + var fuseAttrMask uint32 + maskMap := map[uint32]uint32{ + linux.STATX_MODE: linux.FATTR_MODE, + linux.STATX_UID: linux.FATTR_UID, + linux.STATX_GID: linux.FATTR_GID, + linux.STATX_SIZE: linux.FATTR_SIZE, + linux.STATX_ATIME: linux.FATTR_ATIME, + linux.STATX_MTIME: linux.FATTR_MTIME, + linux.STATX_CTIME: linux.FATTR_CTIME, + } + for statxMask, fattrMask := range maskMap { + if mask&statxMask != 0 { + fuseAttrMask |= fattrMask + } + } + return fuseAttrMask +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return i.setAttr(ctx, fs, creds, opts, false, 0) +} + +func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { + conn := i.fs.conn + task := kernel.TaskFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return syserror.EINVAL + } + + // We should retain the original file type when assigning new mode. + fileType := uint16(i.Mode()) & linux.S_IFMT + fattrMask := fattrMaskFromStats(opts.Stat.Mask) + if useFh { + fattrMask |= linux.FATTR_FH + } + in := linux.FUSESetAttrIn{ + Valid: fattrMask, + Fh: fh, + Size: opts.Stat.Size, + Atime: uint64(opts.Stat.Atime.Sec), + Mtime: uint64(opts.Stat.Mtime.Sec), + Ctime: uint64(opts.Stat.Ctime.Sec), + AtimeNsec: opts.Stat.Atime.Nsec, + MtimeNsec: opts.Stat.Mtime.Nsec, + CtimeNsec: opts.Stat.Ctime.Nsec, + Mode: uint32(fileType | opts.Stat.Mode), + UID: opts.Stat.UID, + GID: opts.Stat.GID, + } + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) + if err != nil { + return err + } + + res, err := conn.Call(task, req) + if err != nil { + return err + } + if err := res.Error(); err != nil { + return err + } + out := linux.FUSEGetAttrOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + // Set the metadata of kernfs.InodeAttrs. + if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), + }); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go new file mode 100644 index 000000000..2d396e84c --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -0,0 +1,244 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ReadInPages sends FUSE_READ requests for the size after round it up to +// a multiple of page size, blocks on it for reply, processes the reply +// and returns the payload (or joined payloads) as a byte slice. +// This is used for the general purpose reading. +// We do not support direct IO (which read the exact number of bytes) +// at this moment. +func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) { + attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion) + + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return nil, 0, syserror.EINVAL + } + + // Round up to a multiple of page size. + readSize, _ := usermem.PageRoundUp(uint64(size)) + + // One request cannnot exceed either maxRead or maxPages. + maxPages := fs.conn.maxRead >> usermem.PageShift + if maxPages > uint32(fs.conn.maxPages) { + maxPages = uint32(fs.conn.maxPages) + } + + var outs [][]byte + var sizeRead uint32 + + // readSize is a multiple of usermem.PageSize. + // Always request bytes as a multiple of pages. + pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift) + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEReadIn{ + Fh: fd.Fh, + LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock + ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + Flags: fd.statusFlags(), + } + + // This loop is intended for fragmented read where the bytes to read is + // larger than either the maxPages or maxRead. + // For the majority of reads with normal size, this loop should only + // execute once. + for pagesRead < pagesToRead { + pagesCanRead := pagesToRead - pagesRead + if pagesCanRead > maxPages { + pagesCanRead = maxPages + } + + in.Offset = off + (uint64(pagesRead) << usermem.PageShift) + in.Size = pagesCanRead << usermem.PageShift + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in) + if err != nil { + return nil, 0, err + } + + // TODO(gvisor.dev/issue/3247): support async read. + + res, err := fs.conn.Call(t, req) + if err != nil { + return nil, 0, err + } + if err := res.Error(); err != nil { + return nil, 0, err + } + + // Not enough bytes in response, + // either we reached EOF, + // or the FUSE server sends back a response + // that cannot even fit the hdr. + if len(res.data) <= res.hdr.SizeBytes() { + // We treat both case as EOF here for now + // since there is no reliable way to detect + // the over-short hdr case. + break + } + + // Directly using the slice to avoid extra copy. + out := res.data[res.hdr.SizeBytes():] + + outs = append(outs, out) + sizeRead += uint32(len(out)) + + pagesRead += pagesCanRead + } + + defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion) + + // No bytes returned: offset >= EOF. + if len(outs) == 0 { + return nil, 0, io.EOF + } + + return outs, sizeRead, nil +} + +// ReadCallback updates several information after receiving a read response. +// Due to readahead, sizeRead can be larger than size. +func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) { + // TODO(gvisor.dev/issue/3247): support async read. + // If this is called by an async read, correctly process it. + // May need to update the signature. + + i := fd.inode() + i.InodeAttrs.TouchAtime(ctx, fd.vfsfd.Mount()) + + // Reached EOF. + if sizeRead < size { + // TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole. + // Might need to update the buf to be returned from the Read(). + + // Update existing size. + newSize := off + uint64(sizeRead) + fs.conn.mu.Lock() + if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) { + fs.conn.attributeVersion++ + i.attributeVersion = i.fs.conn.attributeVersion + atomic.StoreUint64(&i.size, newSize) + } + fs.conn.mu.Unlock() + } +} + +// Write sends FUSE_WRITE requests and return the bytes +// written according to the response. +// +// Preconditions: len(data) == size. +func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + log.Warningf("fusefs.Read: couldn't get kernel task from context") + return 0, syserror.EINVAL + } + + // One request cannnot exceed either maxWrite or maxPages. + maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift + if maxWrite > fs.conn.maxWrite { + maxWrite = fs.conn.maxWrite + } + + // Reuse the same struct for unmarshalling to avoid unnecessary memory allocation. + in := linux.FUSEWriteIn{ + Fh: fd.Fh, + // TODO(gvisor.dev/issue/3245): file lock + LockOwner: 0, + // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER + // TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet) + WriteFlags: 0, + Flags: fd.statusFlags(), + } + + inode := fd.inode() + var written uint32 + + // This loop is intended for fragmented write where the bytes to write is + // larger than either the maxWrite or maxPages or when bigWrites is false. + // Unless a small value for max_write is explicitly used, this loop + // is expected to execute only once for the majority of the writes. + for written < size { + toWrite := size - written + + // Limit the write size to one page. + // Note that the bigWrites flag is obsolete, + // latest libfuse always sets it on. + if !fs.conn.bigWrites && toWrite > usermem.PageSize { + toWrite = usermem.PageSize + } + + // Limit the write size to maxWrite. + if toWrite > maxWrite { + toWrite = maxWrite + } + + in.Offset = off + uint64(written) + in.Size = toWrite + + req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in) + if err != nil { + return 0, err + } + + req.payload = data[written : written+toWrite] + + // TODO(gvisor.dev/issue/3247): support async write. + + res, err := fs.conn.Call(t, req) + if err != nil { + return 0, err + } + if err := res.Error(); err != nil { + return 0, err + } + + out := linux.FUSEWriteOut{} + if err := res.UnmarshalPayload(&out); err != nil { + return 0, err + } + + // Write more than requested? EIO. + if out.Size > toWrite { + return 0, syserror.EIO + } + + written += out.Size + + // Break if short write. Not necessarily an error. + if out.Size != toWrite { + break + } + } + inode.InodeAttrs.TouchCMtime(ctx) + + return written, nil +} diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go new file mode 100644 index 000000000..b5b581152 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/register.go @@ -0,0 +1,42 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go new file mode 100644 index 000000000..5bdd096c3 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -0,0 +1,230 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type regularFileFD struct { + fileDescription + + // off is the file offset. + off int64 + // offMu protects off. + offMu sync.Mutex +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + + size := dst.NumBytes() + if size == 0 { + // Early return if count is 0. + return 0, nil + } else if size > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, syserror.EINVAL + } + + // TODO(gvisor.dev/issue/3678): Add direct IO support. + + inode := fd.inode() + + // Reading beyond EOF, update file size if outdated. + if uint64(offset+size) > atomic.LoadUint64(&inode.size) { + if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil { + return 0, err + } + // If the offset after update is still too large, return error. + if uint64(offset) >= atomic.LoadUint64(&inode.size) { + return 0, io.EOF + } + } + + // Truncate the read with updated file size. + fileSize := atomic.LoadUint64(&inode.size) + if uint64(offset+size) > fileSize { + size = int64(fileSize) - offset + } + + buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size)) + if err != nil { + return 0, err + } + + // TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching), + // store the bytes that were read ahead. + + // Update the number of bytes to copy for short read. + if n < uint32(size) { + size = int64(n) + } + + // Copy the bytes read to the dst. + // This loop is intended for fragmented reads. + // For the majority of reads, this loop only execute once. + var copied int64 + for _, buffer := range buffers { + toCopy := int64(len(buffer)) + if copied+toCopy > size { + toCopy = size - copied + } + cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy]) + if err != nil { + return 0, err + } + if int64(cp) != toCopy { + return 0, syserror.EIO + } + copied += toCopy + } + + return copied, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off + fd.offMu.Unlock() + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { + if offset < 0 { + return 0, offset, syserror.EINVAL + } + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, offset, syserror.EOPNOTSUPP + } + + inode := fd.inode() + inode.metadataMu.Lock() + defer inode.metadataMu.Unlock() + + // If the file is opened with O_APPEND, update offset to file size. + // Note: since our Open() implements the interface of kernfs, + // and kernfs currently does not support O_APPEND, this will never + // be true before we switch out from kernfs. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking inode.metadataMu is sufficient for reading size + offset = int64(inode.size) + } + + srclen := src.NumBytes() + + if srclen > math.MaxUint32 { + // FUSE only supports uint32 for size. + // Overflow. + return 0, offset, syserror.EINVAL + } + if end := offset + srclen; end < offset { + // Overflow. + return 0, offset, syserror.EINVAL + } + + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, offset, err + } + + if srclen == 0 { + // Return before causing any side effects. + return 0, offset, nil + } + + src = src.TakeFirst64(srclen) + + // TODO(gvisor.dev/issue/3237): Add cache support: + // buffer cache. Ideally we write from src to our buffer cache first. + // The slice passed to fs.Write() should be a slice from buffer cache. + data := make([]byte, srclen) + // Reason for making a copy here: connection.Call() blocks on kerneltask, + // which in turn acquires mm.activeMu lock. Functions like CopyInTo() will + // attemp to acquire the mm.activeMu lock as well -> deadlock. + // We must finish reading from the userspace memory before + // t.Block() deactivates it. + cp, err := src.CopyIn(ctx, data) + if err != nil { + return 0, offset, err + } + if int64(cp) != srclen { + return 0, offset, syserror.EIO + } + + n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) + if err != nil { + return 0, offset, err + } + + if n == 0 { + // We have checked srclen != 0 previously. + // If err == nil, then it's a short write and we return EIO. + return 0, offset, syserror.EIO + } + + written = int64(n) + finalOff = offset + written + + if finalOff > int64(inode.size) { + atomic.StoreUint64(&inode.size, uint64(finalOff)) + atomic.AddUint64(&inode.fs.conn.attributeVersion, 1) + } + + return +} diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go new file mode 100644 index 000000000..7fa00569b --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/request_response.go @@ -0,0 +1,229 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE +// server may implement an older version of FUSE protocol, which contains a +// linux.FUSEInitOut with less attributes. +// +// Dynamically-sized objects cannot be marshalled. +type fuseInitRes struct { + marshal.StubMarshallable + + // initOut contains the response from the FUSE server. + initOut linux.FUSEInitOut + + // initLen is the total length of bytes of the response. + initLen uint32 +} + +// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes. +func (r *fuseInitRes) UnmarshalBytes(src []byte) { + out := &r.initOut + + // Introduced before FUSE kernel version 7.13. + out.Major = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + + // Introduced in FUSE kernel version 7.23. + if len(src) >= 4 { + out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4])) + src = src[4:] + } + // Introduced in FUSE kernel version 7.28. + if len(src) >= 2 { + out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2])) + src = src[2:] + } +} + +// SizeBytes is the size of the payload of the FUSE_INIT response. +func (r *fuseInitRes) SizeBytes() int { + return int(r.initLen) +} + +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte + + // payload for this request: extra bytes to write after + // the data slice. Used by FUSE_WRITE. + payload []byte + + // If this request is async. + async bool + // If we don't care its response. + // Manually set by the caller. + noReply bool +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + hdr.MarshalBytes(buf[:hdrLen]) + payload.MarshalBytes(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte + + // If this request is async. + async bool +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(req *Request) *futureResponse { + return &futureResponse{ + opcode: req.hdr.Opcode, + ch: make(chan struct{}), + async: req.async, + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // Return directly for async requests. + if f.async { + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// DataLen returns the size of the response without the header. +func (r *Response) DataLen() uint32 { + return r.hdr.Len - uint32(r.hdr.SizeBytes()) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + // The response data is empty unless there is some payload. And so, doesn't + // need to be unmarshalled. + if r.data == nil { + return nil + } + + // TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again. + m.UnmarshalBytes(r.data[hdrLen:]) + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go new file mode 100644 index 000000000..e1d9e3365 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/utils_test.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "io" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +func setup(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Error creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + AllowUserMount: true, + }) + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + + return testutil.NewSystem(ctx, t, k.VFS(), mntns) +} + +// newTestConnection creates a fuse connection that the sentry can communicate with +// and the FD for the server to communicate with. +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { + vfsObj := &vfs.VirtualFilesystem{} + fuseDev := &DeviceFD{} + + if err := vfsObj.Init(system.Ctx); err != nil { + return nil, nil, err + } + + vd := vfsObj.NewAnonVirtualDentry("genCountFD") + defer vd.DecRef(system.Ctx) + if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, nil, err + } + + fsopts := filesystemOptions{ + maxActiveRequests: maxActiveRequests, + } + fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) + if err != nil { + return nil, nil, err + } + + return fs.conn, &fuseDev.vfsfd, nil +} + +type testPayload struct { + marshal.StubMarshallable + data uint32 +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (t *testPayload) SizeBytes() int { + return 4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (t *testPayload) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], t.data) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (t *testPayload) UnmarshalBytes(src []byte) { + *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} +} + +// Packed implements marshal.Marshallable.Packed. +func (t *testPayload) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (t *testPayload) MarshalUnsafe(dst []byte) { + t.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (t *testPayload) UnmarshalUnsafe(src []byte) { + t.UnmarshalBytes(src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) { + panic("not implemented") +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// WriteTo implements io.WriterTo.WriteTo. +func (t *testPayload) WriteTo(w io.Writer) (int64, error) { + panic("not implemented") +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index f5f35a3bc..4c3e9acf8 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -38,6 +38,7 @@ go_library( "host_named_pipe.go", "p9file.go", "regular_file.go", + "save_restore.go", "socket.go", "special_file.go", "symlink.go", @@ -52,8 +53,11 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/p9", + "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/host", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", @@ -68,6 +72,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", @@ -84,5 +89,6 @@ go_test( deps = [ "//pkg/p9", "//pkg/sentry/contexttest", + "//pkg/sentry/pgalloc", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index b98218753..e993c8e36 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -16,16 +16,17 @@ package gofer import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -34,8 +35,11 @@ func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } -// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. -// d.isDir(). child must be a newly-created dentry that has never had a parent. +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +// * child must be a newly-created dentry that has never had a parent. func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent child.parent = d @@ -46,7 +50,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.children[name] = child } -// Preconditions: d.dirMu must be locked. d.isDir(). +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). func (d *dentry) cacheNegativeLookupLocked(name string) { // Don't cache negative lookups if InteropModeShared is in effect (since // this makes remote lookup unavoidable), or if d.isSynthetic() (in which @@ -79,49 +85,54 @@ type createSyntheticOpts struct { // createSyntheticChildLocked creates a synthetic file with the given name // in d. // -// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain -// a child with the given name. +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). +// * d does not already contain a child with the given name. func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { - d2 := &dentry{ + child := &dentry{ refs: 1, // held by d fs: d.fs, + ino: d.fs.nextIno(), mode: uint32(opts.mode), uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary - handle: handle{ - fd: -1, - }, - nlink: uint32(2), + hostFD: -1, + nlink: uint32(2), + } + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Register(child, "gofer.dentry") } switch opts.mode.FileType() { case linux.S_IFDIR: // Nothing else needs to be done. case linux.S_IFSOCK: - d2.endpoint = opts.endpoint + child.endpoint = opts.endpoint case linux.S_IFIFO: - d2.pipe = opts.pipe + child.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } - d2.pf.dentry = d2 - d2.vfsd.Init(d2) + child.pf.dentry = child + child.vfsd.Init(child) - d.cacheNewChildLocked(d2, opts.name) + d.cacheNewChildLocked(child, opts.name) d.syntheticChildren++ } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -138,6 +149,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } + d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -151,7 +163,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba return nil } -// Preconditions: d.isDir(). There exists at least one directoryFD representing d. +// Preconditions: +// * d.isDir(). +// * There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the // presence of concurrent mutation of an iterated directory, so @@ -183,13 +197,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { { Name: ".", Type: linux.DT_DIR, - Ino: d.ino, + Ino: uint64(d.ino), NextOff: 1, }, { Name: "..", Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), - Ino: parent.ino, + Ino: uint64(parent.ino), NextOff: 2, }, } @@ -203,14 +217,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() - if !d.handleReadable { + if d.readFile.isNil() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } for { - p9ds, err := d.handle.file.readdir(ctx, off, count) + p9ds, err := d.readFile.readdir(ctx, off, count) if err != nil { d.handleMu.RUnlock() return nil, err @@ -225,7 +239,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { } dirent := vfs.Dirent{ Name: p9d.Name, - Ino: p9d.QID.Path, + Ino: d.fs.inoFromQIDPath(p9d.QID.Path), NextOff: int64(len(dirents) + 1), } // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or @@ -258,7 +272,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { dirents = append(dirents, vfs.Dirent{ Name: child.name, Type: uint8(atomic.LoadUint32(&child.mode) >> 12), - Ino: child.ino, + Ino: uint64(child.ino), NextOff: int64(len(dirents) + 1), }) } @@ -299,3 +313,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in return 0, syserror.EINVAL } } + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *directoryFD) Sync(ctx context.Context) error { + return fd.dentry().syncRemoteFile(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 36e0e1856..baecb88c4 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -15,7 +15,9 @@ package gofer import ( + "math" "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -33,7 +35,7 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current syncable dentries and special files. + // Snapshot current syncable dentries and special file FDs. fs.syncMu.Lock() ds := make([]*dentry, 0, len(fs.syncableDentries)) for d := range fs.syncableDentries { @@ -51,22 +53,28 @@ func (fs *filesystem) Sync(ctx context.Context) error { // regardless. var retErr error - // Sync regular files. + // Sync syncable dentries. for _, d := range ds { - err := d.syncSharedHandle(ctx) - d.DecRef() - if err != nil && retErr == nil { - retErr = err + err := d.syncCachedFile(ctx, true /* forFilesystemSync */) + d.DecRef(ctx) + if err != nil { + ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) + if retErr == nil { + retErr = err + } } } // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - err := sffd.Sync(ctx) - sffd.vfsfd.DecRef() - if err != nil && retErr == nil { - retErr = err + err := sffd.sync(ctx, true /* forFilesystemSync */) + sffd.vfsfd.DecRef(ctx) + if err != nil { + ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) + if retErr == nil { + retErr = err + } } } @@ -113,12 +121,15 @@ func putDentrySlice(ds *[]*dentry) { // Dentries which may become cached as a result of the traversal are appended // to *ds. // -// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata -// must be up to date. +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). +// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up +// to date. // // Postconditions: The returned dentry's cached metadata is up to date. -func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -132,7 +143,7 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() @@ -145,15 +156,13 @@ afterSymlink: // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { - _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) - if err != nil { + if err := d.parent.updateFromGetattr(ctx); err != nil { return nil, err } - d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() return d.parent, nil @@ -165,10 +174,10 @@ afterSymlink: if child == nil { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } - if child.isSymlink() && rp.ShouldFollowSymlink() { + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { return nil, err @@ -185,8 +194,11 @@ afterSymlink: // getChildLocked returns a dentry representing the child of parent with the // given name. If no such child exists, getChildLocked returns (nil, nil). // -// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. -// parent.isDir(). name is not "." or "..". +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * name is not "." or "..". // // Postconditions: If getChildLocked returns a non-nil dentry, its cached // metadata is up to date. @@ -206,20 +218,31 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) } -// Preconditions: As for getChildLocked. !parent.isSynthetic(). +// Preconditions: Same as getChildLocked, plus: +// * !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { + if child != nil { + // Need to lock child.metadataMu because we might be updating child + // metadata. We need to hold the lock *before* getting metadata from the + // server and release it after updating local metadata. + child.metadataMu.Lock() + } qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { + if child != nil { + child.metadataMu.Unlock() + } return nil, err } if child != nil { - if !file.isNil() && qid.Path == child.ino { - // The file at this path hasn't changed. Just update cached - // metadata. + if !file.isNil() && qid.Path == child.qidPath { + // The file at this path hasn't changed. Just update cached metadata. file.close(ctx) - child.updateFromP9Attrs(attrMask, &attr) + child.updateFromP9AttrsLocked(attrMask, &attr) + child.metadataMu.Unlock() return child, nil } + child.metadataMu.Unlock() if file.isNil() && child.isSynthetic() { // We have a synthetic file, and no remote file has arisen to // replace it. @@ -230,7 +253,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // has 0 references, drop it). Wait to update parent.children until we // know what to replace the existing dentry with (i.e. one of the // returns below), to avoid a redundant map access. - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) if child.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for @@ -269,13 +292,15 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). If -// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to -// date. +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). +// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up +// to date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err @@ -301,7 +326,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } for !rp.Done() { d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err @@ -318,12 +343,13 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, // createInRemoteDir (if the parent directory is a real remote directory) or // createInSyntheticDir (if the parent directory is synthetic) to do so. // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -371,24 +397,40 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } parent.touchCMtime() parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if fs.opts.interop == InteropModeShared { - // The existence of a dentry at name would be inconclusive because the - // file it represents may have been deleted from the remote filesystem, - // so we would need to make an RPC to revalidate the dentry. Just - // attempt the file creation RPC instead. If a file does exist, the RPC - // will fail with EEXIST like we would have. If the RPC succeeds, and a - // stale dentry exists, the dentry will fail revalidation next time - // it's used. - return createInRemoteDir(parent, name) + if child := parent.children[name]; child != nil && child.isSynthetic() { + return syserror.EEXIST + } + // The existence of a non-synthetic dentry at name would be inconclusive + // because the file it represents may have been deleted from the remote + // filesystem, so we would need to make an RPC to revalidate the dentry. + // Just attempt the file creation RPC instead. If a file does exist, the + // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a + // stale dentry exists, the dentry will fail revalidation next time it's + // used. + if err := createInRemoteDir(parent, name, &ds); err != nil { + return err + } + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + return nil } if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := createInRemoteDir(parent, name); err != nil { + if err := createInRemoteDir(parent, name, &ds); err != nil { return err } if child, ok := parent.children[name]; ok && child == nil { @@ -397,6 +439,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } parent.touchCMtime() parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -404,7 +451,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -440,24 +487,64 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() + child, ok := parent.children[name] if ok && child == nil { return syserror.ENOENT } - // We only need a dentry representing the file at name if it can be a mount - // point. If child is nil, then it can't be a mount point. If child is - // non-nil but stale, the actual file can't be a mount point either; we - // detect this case by just speculatively calling PrepareDeleteDentry and - // only revalidating the dentry if that fails (indicating that the existing - // dentry is a mount point). + + sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0 + if sticky { + if !ok { + // If the sticky bit is set, we need to retrieve the child to determine + // whether removing it is allowed. + child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) + if err != nil { + return err + } + } else if child != nil && !child.cachedMetadataAuthoritative() { + // Make sure the dentry representing the file at name is up to date + // before examining its metadata. + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) + if err != nil { + return err + } + } + if err := parent.mayDelete(rp.Credentials(), child); err != nil { + return err + } + } + + // If a child dentry exists, prepare to delete it. This should fail if it is + // a mount point. We detect mount points by speculatively calling + // PrepareDeleteDentry, which fails if child is a mount point. However, we + // may need to revalidate the file in this case to make sure that it has not + // been deleted or replaced on the remote fs, in which case the mount point + // will have disappeared. If calling PrepareDeleteDentry fails again on the + // up-to-date dentry, we can be sure that it is a mount point. + // + // Also note that if child is nil, then it can't be a mount point. if child != nil { + // Hold child.dirMu so we can check child.children and + // child.syntheticChildren. We don't access these fields until a bit later, + // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would + // create an inconsistent lock ordering between dentry.dirMu and + // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a + // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between + // PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock + // child.dirMu before calling PrepareDeleteDentry. child.dirMu.Lock() defer child.dirMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { - if parent.cachedMetadataAuthoritative() { + // We can skip revalidation in several cases: + // - We are not in InteropModeShared + // - The parent directory is synthetic, in which case the child must also + // be synthetic + // - We already updated the child during the sticky bit check above + if parent.cachedMetadataAuthoritative() || sticky { return err } child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) @@ -518,7 +605,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child == nil { return syserror.ENOENT } - } else { + } else if child == nil || !child.isSynthetic() { err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { @@ -527,8 +614,20 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return err } } + + // Generate inotify events for rmdir or unlink. + if dir { + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + } else { + var cw *vfs.Watches + if child != nil { + cw = &child.watches + } + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) + } + if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -555,7 +654,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -563,20 +662,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -586,7 +685,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -598,7 +697,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -619,7 +718,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -638,19 +737,40 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } - // 9P2000.L supports hard links, but we don't. - return syserror.EPERM + d := vd.Dentry().Impl().(*dentry) + if d.isDir() { + return syserror.EPERM + } + gid := auth.KGID(atomic.LoadUint32(&d.gid)) + uid := auth.KUID(atomic.LoadUint32(&d.uid)) + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { + return err + } + if d.nlink == 0 { + return syserror.ENOENT + } + if d.nlink == math.MaxUint32 { + return syserror.EMLINK + } + if err := parent.file.link(ctx, d.file, childName); err != nil { + return err + } + + // Success! + atomic.AddUint32(&d.nlink, 1) + return nil }, nil) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { creds := rp.Credentials() - return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { return err @@ -685,34 +805,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - // If the gofer does not allow creating a socket or pipe, create a - // synthetic one, i.e. one that is kept entirely in memory. - if err == syserror.EPERM { - switch opts.Mode.FileType() { - case linux.S_IFSOCK: - parent.createSyntheticChildLocked(&createSyntheticOpts{ - name: name, - mode: opts.Mode, - kuid: creds.EffectiveKUID, - kgid: creds.EffectiveKGID, - endpoint: opts.Endpoint, - }) - return nil - case linux.S_IFIFO: - parent.createSyntheticChildLocked(&createSyntheticOpts{ - name: name, - mode: opts.Mode, - kuid: creds.EffectiveKUID, - kgid: creds.EffectiveKGID, - pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), - }) - return nil - } + if err != syserror.EPERM { + return err } - return err + + // EPERM means that gofer does not allow creating a socket or pipe. Fallback + // to creating a synthetic one, i.e. one that is kept entirely in memory. + + // Check that we're not overriding an existing file with a synthetic one. + _, err = fs.stepLocked(ctx, rp, parent, true, ds) + switch { + case err == nil: + // Step succeeded, another file exists. + return syserror.EEXIST + case err != syserror.ENOENT: + // Unexpected error. + return err + } + + switch opts.Mode.FileType() { + case linux.S_IFSOCK: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + endpoint: opts.Endpoint, + }) + return nil + case linux.S_IFIFO: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + }) + return nil + } + // Retain error from gofer if synthetic file cannot be created internally. + return syserror.EPERM }, nil) } @@ -730,7 +865,14 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + unlocked := false + unlock := func() { + if !unlocked { + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + unlocked = true + } + } + defer unlock() start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { @@ -740,7 +882,17 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } } if rp.Done() { - return start.openLocked(ctx, rp, &opts) + // Reject attempts to open mount root directory with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + start.IncRef() + defer start.DecRef(ctx) + unlock() + return start.open(ctx, rp, &opts) } afterTrailingSymlink: @@ -752,9 +904,13 @@ afterTrailingSymlink: if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } + // Reject attempts to open directories with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } // Determine whether or not we need to create a file. parent.dirMu.Lock() - child, err := fs.stepLocked(ctx, rp, parent, &ds) + child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) if err == syserror.ENOENT && mayCreate { if parent.isSynthetic() { parent.dirMu.Unlock() @@ -764,15 +920,14 @@ afterTrailingSymlink: parent.dirMu.Unlock() return fd, err } + parent.dirMu.Unlock() if err != nil { - parent.dirMu.Unlock() return nil, err } - // Open existing child or follow symlink. - parent.dirMu.Unlock() if mustCreate { return nil, syserror.EEXIST } + // Open existing child or follow symlink. if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { @@ -784,29 +939,48 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } - return child.openLocked(ctx, rp, &opts) + if rp.MustBeDir() && !child.isDir() { + return nil, syserror.ENOTDIR + } + child.IncRef() + defer child.DecRef(ctx) + unlock() + return child.open(ctx, rp, &opts) } -// Preconditions: fs.renameMu must be locked. -func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { +// Preconditions: The caller must hold no locks (since opening pipes may block +// indefinitely). +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + + trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG + if trunc { + // Lock metadataMu *while* we open a regular file with O_TRUNC because + // open(2) will change the file size on server. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + } + + var vfd *vfs.FileDescription + var err error mnt := rp.Mount() switch d.fileType() { case linux.S_IFREG: if !d.fs.opts.regularFilesUseSpecialFileFD { - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil { return nil, err } fd := ®ularFileFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } - return &fd.vfsfd, nil + vfd = &fd.vfsfd } case linux.S_IFDIR: // Can't open directories with O_CREAT. @@ -826,6 +1000,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } } fd := &directoryFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -838,17 +1013,35 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, syserror.ENXIO } if d.fs.iopts.OpenSocketsByConnecting { - return d.connectSocketLocked(ctx, opts) + return d.openSocketByConnecting(ctx, opts) } case linux.S_IFIFO: if d.isSynthetic() { - return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) + } + } + + if vfd == nil { + if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil { + return nil, err } } - return d.openSpecialFileLocked(ctx, mnt, opts) + + if trunc { + // If no errors occured so far then update file size in memory. This + // step is required even if !d.cachedMetadataAuthoritative() because + // d.mappings has to be updated. + // d.metadataMu has already been acquired if trunc == true. + d.updateSizeLocked(0) + + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + } + return vfd, err } -func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } @@ -868,7 +1061,7 @@ func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) return fd, nil } -func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL @@ -902,7 +1095,7 @@ retry: return nil, err } } - fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err @@ -910,8 +1103,10 @@ retry: return &fd.vfsfd, nil } -// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. -// !d.isSynthetic(). +// Preconditions: +// * d.fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -934,10 +1129,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - // Filter file creation flags and O_LARGEFILE out; the create RPC already - // has the semantics of O_CREAT|O_EXCL, while some servers will choke on - // O_LARGEFILE. - createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + // We only want the access mode for creating the file. + createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) @@ -970,12 +1163,18 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { child.handleMu.Lock() - child.handle.file = openFile - if fdobj != nil { - child.handle.fd = int32(fdobj.Release()) + if vfs.MayReadFileWithOpenFlags(opts.Flags) { + child.readFile = openFile + if fdobj != nil { + child.hostFD = int32(fdobj.Release()) + } + } else if fdobj != nil { + // Can't use fdobj if it's not readable. + fdobj.Close() + } + if vfs.MayWriteFileWithOpenFlags(opts.Flags) { + child.writeFile = openFile } - child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) - child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } // Insert the dentry into the tree. @@ -989,6 +1188,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving var childVFSFD *vfs.FileDescription if useRegularFileFD { fd := ®ularFileFD{} + fd.LockFD.Init(&child.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { @@ -1003,13 +1203,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving if fdobj != nil { h.fd = int32(fdobj.Release()) } - fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd } + d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1017,7 +1218,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1037,7 +1238,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckCaching(&ds) + defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -1061,7 +1262,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -1076,12 +1278,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if renamed == nil { return syserror.ENOENT } + if err := oldParent.mayDelete(creds, renamed); err != nil { + return err + } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { return err } } @@ -1092,7 +1297,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() @@ -1112,6 +1317,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !renamed.isDir() { return syserror.EISDIR } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } } else { if rp.MustBeDir() || renamed.isDir() { return syserror.ENOTDIR @@ -1123,7 +1331,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return nil } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } @@ -1148,7 +1356,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update the dentry tree. - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { @@ -1162,14 +1370,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // with reference counts and queue oldParent for checkCachingLocked if the // parent isn't actually changing. if oldParent != newParent { + oldParent.decRefLocked() ds = appendDentry(ds, oldParent) newParent.IncRef() if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } + renamed.parent = newParent } - renamed.parent = newParent renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) @@ -1190,10 +1399,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.cachedMetadataAuthoritative() { newParent.dirents = nil newParent.touchCMtime() - if renamed.isDir() { + if renamed.isDir() && (replaced == nil || !replaced.isDir()) { + // Increase the link count if we did not replace another directory. newParent.incLinks() } } + vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) return nil } @@ -1206,19 +1417,28 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) + err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) + } + return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1235,7 +1455,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1269,7 +1489,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err @@ -1281,11 +1501,11 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1298,61 +1518,76 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath d.IncRef() return &endpoint{ dentry: d, - file: d.file.file, path: opts.Addr, }, nil } - return d.endpoint, nil + if d.endpoint != nil { + return d.endpoint, nil + } } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } - return d.listxattr(ctx, rp.Credentials(), size) + return d.listXattr(ctx, rp.Credentials(), size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } - return d.getxattr(ctx, rp.Credentials(), &opts) + return d.getXattr(ctx, rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + return err + } + err = d.setXattr(ctx, rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { return err } - return d.setxattr(ctx, rp.Credentials(), &opts) + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - return d.removexattr(ctx, rp.Credentials(), name) + err = d.removeXattr(ctx, rp.Credentials(), name) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3f3bd56f0..80668ebc1 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -26,6 +26,9 @@ // *** "memmap.Mappable locks taken by Translate" below this point // dentry.handleMu // dentry.dataMu +// filesystem.inoMu +// specialFileFD.mu +// specialFileFD.bufMu // // Locking dentry.dirMu in multiple dentries requires that either ancestor // dentries are locked before descendant dentries, or that filesystem.renameMu @@ -36,7 +39,6 @@ import ( "fmt" "strconv" "strings" - "sync" "sync/atomic" "syscall" @@ -44,7 +46,10 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -52,6 +57,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" @@ -61,9 +67,13 @@ import ( const Name = "9p" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -76,7 +86,7 @@ type filesystem struct { iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. - client *p9.Client + client *p9.Client `state:"nosave"` // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock @@ -84,6 +94,9 @@ type filesystem struct { // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 + // root is the root dentry. root is immutable. + root *dentry + // renameMu serves two purposes: // // - It synchronizes path resolution with renaming initiated by this @@ -94,23 +107,42 @@ type filesystem struct { // reference count (such that it is usable as vfs.ResolvingPath.Start() or // is reachable from its children), or if it is a child dentry (such that // it is reachable from its parent). - renameMu sync.RWMutex + renameMu sync.RWMutex `state:"nosave"` // cachedDentries contains all dentries with 0 references. (Due to race // conditions, it may also contain dentries with non-zero references.) - // cachedDentriesLen is the number of dentries in cachedDentries. These - // fields are protected by renameMu. + // cachedDentriesLen is the number of dentries in cachedDentries. These fields + // are protected by renameMu. cachedDentries dentryList cachedDentriesLen uint64 - // syncableDentries contains all dentries in this filesystem for which - // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. - // These fields are protected by syncMu. - syncMu sync.Mutex + // syncableDentries contains all non-synthetic dentries. specialFileFDs + // contains all open specialFileFDs. These fields are protected by syncMu. + syncMu sync.Mutex `state:"nosave"` syncableDentries map[*dentry]struct{} specialFileFDs map[*specialFileFD]struct{} + + // inoByQIDPath maps previously-observed QID.Paths to inode numbers + // assigned to those paths. inoByQIDPath is not preserved across + // checkpoint/restore because QIDs may be reused between different gofer + // processes, so QIDs may be repeated for different files across + // checkpoint/restore. inoByQIDPath is protected by inoMu. + inoMu sync.Mutex `state:"nosave"` + inoByQIDPath map[uint64]uint64 `state:"nosave"` + + // lastIno is the last inode number assigned to a file. lastIno is accessed + // using atomic memory operations. + lastIno uint64 + + // savedDentryRW records open read/write handles during save/restore. + savedDentryRW map[*dentry]savedDentryRW + + // released is nonzero once filesystem.Release has been called. It is accessed + // with atomic memory operations. + released int32 } +// +stateify savable type filesystemOptions struct { // "Standard" 9P options. fd int @@ -121,8 +153,7 @@ type filesystemOptions struct { msize uint32 version string - // maxCachedDentries is the maximum number of dentries with 0 references - // retained by the client. + // maxCachedDentries is the maximum size of filesystem.cachedDentries. maxCachedDentries uint64 // If forcePageCache is true, host FDs may not be used for application @@ -156,6 +187,8 @@ type filesystemOptions struct { // InteropMode controls the client's interaction with other remote filesystem // users. +// +// +stateify savable type InteropMode uint32 const ( @@ -171,10 +204,10 @@ const ( // // - File timestamps are based on client clocks. This ensures that users of // the client observe timestamps that are coherent with their own clocks - // and consistent with Linux's semantics. However, since it is not always - // possible for clients to set arbitrary atimes and mtimes, and never - // possible for clients to set arbitrary ctimes, file timestamp changes are - // stored in the client only and never sent to the remote filesystem. + // and consistent with Linux's semantics (in particular, it is not always + // possible for clients to set arbitrary atimes and mtimes depending on the + // remote filesystem implementation, and never possible for clients to set + // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -214,7 +247,13 @@ const ( // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable type InternalFilesystemOptions struct { + // If UniqueID is non-empty, it is an opaque string used to reassociate the + // filesystem with a new server FD during restoration from checkpoint. + UniqueID string + // If LeakConnection is true, do not close the connection to the server // when the Filesystem is released. This is necessary for deployments in // which servers can handle only a single client and report failure if that @@ -240,6 +279,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) @@ -251,46 +293,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt mopts := vfs.GenericParseMountOptions(opts.Data) var fsopts filesystemOptions - // Check that the transport is "fd". - trans, ok := mopts["trans"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") - return nil, nil, syserror.EINVAL - } - delete(mopts, "trans") - if trans != "fd" { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) - return nil, nil, syserror.EINVAL - } - - // Check that read and write FDs are provided and identical. - rfdstr, ok := mopts["rfdno"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>") - return nil, nil, syserror.EINVAL - } - delete(mopts, "rfdno") - rfd, err := strconv.Atoi(rfdstr) + fd, err := getFDFromMountOptionsMap(ctx, mopts) if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) - return nil, nil, syserror.EINVAL - } - wfdstr, ok := mopts["wfdno"] - if !ok { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>") - return nil, nil, syserror.EINVAL - } - delete(mopts, "wfdno") - wfd, err := strconv.Atoi(wfdstr) - if err != nil { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) - return nil, nil, syserror.EINVAL - } - if rfd != wfd { - ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) - return nil, nil, syserror.EINVAL + return nil, nil, err } - fsopts.fd = rfd + fsopts.fd = fd // Get the attach name. fsopts.aname = "/" @@ -406,85 +413,133 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // If !ok, iopts being the zero value is correct. - // Establish a connection with the server. - conn, err := unet.NewSocket(fsopts.fd) + // Construct the filesystem object. + devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } + fs := &filesystem{ + mfp: mfp, + opts: fsopts, + iopts: iopts, + clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), + inoByQIDPath: make(map[uint64]uint64), + } + fs.vfsfs.Init(vfsObj, &fstype, fs) - // Perform version negotiation with the server. - ctx.UninterruptibleSleepStart(false) - client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) - ctx.UninterruptibleSleepFinish(false) - if err != nil { - conn.Close() + // Connect to the server. + if err := fs.dial(ctx); err != nil { return nil, nil, err } - // Ownership of conn has been transferred to client. // Perform attach to obtain the filesystem root. ctx.UninterruptibleSleepStart(false) - attached, err := client.Attach(fsopts.aname) + attached, err := fs.client.Attach(fsopts.aname) ctx.UninterruptibleSleepFinish(false) if err != nil { - client.Close() + fs.vfsfs.DecRef(ctx) return nil, nil, err } attachFile := p9file{attached} qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) if err != nil { attachFile.close(ctx) - client.Close() - return nil, nil, err - } - - // Construct the filesystem object. - devMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - attachFile.close(ctx) - client.Close() + fs.vfsfs.DecRef(ctx) return nil, nil, err } - fs := &filesystem{ - mfp: mfp, - opts: fsopts, - iopts: iopts, - client: client, - clock: ktime.RealtimeClockFromContext(ctx), - devMinor: devMinor, - syncableDentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), - } - fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the - // caller, and the other is deliberately leaked to prevent the root from - // being "cached" and subsequently evicted. Its resources will still be - // cleaned up by fs.Release(). + // caller, and the other is held by fs to prevent the root from being "cached" + // and subsequently evicted. root.refs = 2 + fs.root = root return &fs.vfsfs, &root.vfsd, nil } +func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { + // Check that the transport is "fd". + trans, ok := mopts["trans"] + if !ok || trans != "fd" { + ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as 'trans=fd'") + return -1, syserror.EINVAL + } + delete(mopts, "trans") + + // Check that read and write FDs are provided and identical. + rfdstr, ok := mopts["rfdno"] + if !ok { + ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as 'rfdno=<file descriptor>'") + return -1, syserror.EINVAL + } + delete(mopts, "rfdno") + rfd, err := strconv.Atoi(rfdstr) + if err != nil { + ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: rfdno=%s", rfdstr) + return -1, syserror.EINVAL + } + wfdstr, ok := mopts["wfdno"] + if !ok { + ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as 'wfdno=<file descriptor>'") + return -1, syserror.EINVAL + } + delete(mopts, "wfdno") + wfd, err := strconv.Atoi(wfdstr) + if err != nil { + ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: wfdno=%s", wfdstr) + return -1, syserror.EINVAL + } + if rfd != wfd { + ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) + return -1, syserror.EINVAL + } + return rfd, nil +} + +// Preconditions: fs.client == nil. +func (fs *filesystem) dial(ctx context.Context) error { + // Establish a connection with the server. + conn, err := unet.NewSocket(fs.opts.fd) + if err != nil { + return err + } + + // Perform version negotiation with the server. + ctx.UninterruptibleSleepStart(false) + client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + conn.Close() + return err + } + // Ownership of conn has been transferred to client. + + fs.client = client + return nil +} + // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { - ctx := context.Background() - mf := fs.mfp.MemoryFile() +func (fs *filesystem) Release(ctx context.Context) { + atomic.StoreInt32(&fs.released, 1) + mf := fs.mfp.MemoryFile() fs.syncMu.Lock() for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() - if d.handleWritable { + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty cached data to the remote file. - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? @@ -494,9 +549,9 @@ func (fs *filesystem) Release() { d.dirty.RemoveAll() d.dataMu.Unlock() // Close the host fd if one exists. - if d.handle.fd >= 0 { - syscall.Close(int(d.handle.fd)) - d.handle.fd = -1 + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() } @@ -505,6 +560,21 @@ func (fs *filesystem) Release() { // fs. fs.syncMu.Unlock() + // If leak checking is enabled, release all outstanding references in the + // filesystem. We deliberately avoid doing this outside of leak checking; we + // have released all external resources above rather than relying on dentry + // destructors. + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + fs.renameMu.Lock() + fs.root.releaseSyntheticRecursiveLocked(ctx) + fs.evictAllCachedDentriesLocked(ctx) + fs.renameMu.Unlock() + + // An extra reference was held by the filesystem on the root to prevent it from + // being cached/evicted. + fs.root.DecRef(ctx) + } + if !fs.iopts.LeakConnection { // Close the connection to the server. This implicitly clunks all fids. fs.client.Close() @@ -513,7 +583,34 @@ func (fs *filesystem) Release() { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } +// releaseSyntheticRecursiveLocked traverses the tree with root d and decrements +// the reference count on every synthetic dentry. Synthetic dentries have one +// reference for existence that should be dropped during filesystem.Release. +// +// Precondition: d.fs.renameMu is locked. +func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { + if d.isSynthetic() { + d.decRefLocked() + d.checkCachingLocked(ctx) + } + if d.isDir() { + var children []*dentry + d.dirMu.Lock() + for _, child := range d.children { + children = append(children, child) + } + d.dirMu.Unlock() + for _, child := range children { + if child != nil { + child.releaseSyntheticRecursiveLocked(ctx) + } + } + } +} + // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -538,14 +635,15 @@ type dentry struct { // filesystem.renameMu. name string - // We don't support hard links, so each dentry maps 1:1 to an inode. + // qidPath is the p9.QID.Path for this file. qidPath is immutable. + qidPath uint64 // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file // that does not exist on the remote filesystem. As of this writing, the // only files that can be synthetic are sockets, pipes, and directories. - file p9file + file p9file `state:"nosave"` // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. @@ -557,7 +655,7 @@ type dentry struct { cached bool dentryEntry - dirMu sync.Mutex + dirMu sync.Mutex `state:"nosave"` // If this dentry represents a directory, children contains: // @@ -581,60 +679,75 @@ type dentry struct { // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent - // Cached metadata; protected by metadataMu and accessed using atomic - // memory operations unless otherwise specified. - metadataMu sync.Mutex - ino uint64 // immutable - mode uint32 // type is immutable, perms are mutable - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - blockSize uint32 // 0 if unknown + // Cached metadata; protected by metadataMu. + // To access: + // - In situations where consistency is not required (like stat), these + // can be accessed using atomic operations only (without locking). + // - Lock metadataMu and can access without atomic operations. + // To mutate: + // - Lock metadataMu and use atomic operations to update because we might + // have atomic readers that don't hold the lock. + metadataMu sync.Mutex `state:"nosave"` + ino uint64 // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown // Timestamps, all nsecs from the Unix epoch. atime int64 mtime int64 ctime int64 btime int64 - // File size, protected by both metadataMu and dataMu (i.e. both must be - // locked to mutate it). + // File size, which differs from other metadata in two ways: + // + // - We make a best-effort attempt to keep it up to date even if + // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. + // + // - size is protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it; locking either is sufficient to access it). size uint64 + // If this dentry does not represent a synthetic file, deleted is 0, and + // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the + // remote file's timestamps, which should be updated when this dentry is + // evicted. + atimeDirty uint32 + mtimeDirty uint32 // nlink counts the number of hard links to this dentry. It's updated and // accessed using atomic operations. It's not protected by metadataMu like the // other metadata fields. nlink uint32 - mapsMu sync.Mutex + mapsMu sync.Mutex `state:"nosave"` // If this dentry represents a regular file, mappings tracks mappings of // the file into memmap.MappingSpaces. mappings is protected by mapsMu. mappings memmap.MappingSet - // If this dentry represents a regular file or directory: + // - If this dentry represents a regular file or directory, readFile is the + // p9.File used for reads by all regularFileFDs/directoryFDs representing + // this dentry. // - // - handle is the I/O handle used by all regularFileFDs/directoryFDs - // representing this dentry. + // - If this dentry represents a regular file, writeFile is the p9.File + // used for writes by all regularFileFDs representing this dentry. // - // - handleReadable is true if handle is readable. - // - // - handleWritable is true if handle is writable. - // - // Invariants: - // - // - If handleReadable == handleWritable == false, then handle.file == nil - // (i.e. there is no open handle). Conversely, if handleReadable || - // handleWritable == true, then handle.file != nil (i.e. there is an open - // handle). - // - // - handleReadable and handleWritable cannot transition from true to false - // (i.e. handles may not be downgraded). + // - If this dentry represents a regular file, hostFD is the host FD used + // for memory mappings and I/O (when applicable) in preference to readFile + // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must + // also be writable. If hostFD is -1, no such host FD is available. // // These fields are protected by handleMu. - handleMu sync.RWMutex - handle handle - handleReadable bool - handleWritable bool + // + // readFile and writeFile may or may not represent the same p9.File. Once + // either p9.File transitions from closed (isNil() == true) to open + // (isNil() == false), it may be mutated with handleMu locked, but cannot + // be closed until the dentry is destroyed. + handleMu sync.RWMutex `state:"nosave"` + readFile p9file `state:"nosave"` + writeFile p9file `state:"nosave"` + hostFD int32 `state:"nosave"` - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // If this dentry represents a regular file that is client-cached, cache // maps offsets into the cached file to offsets into @@ -646,7 +759,7 @@ type dentry struct { // tracks dirty segments in cache. dirty is protected by dataMu. dirty fsutil.DirtySet - // pf implements platform.File for mappings of handle.fd. + // pf implements platform.File for mappings of hostFD. pf dentryPlatformFile // If this dentry represents a symbolic link, InteropModeShared is not in @@ -662,6 +775,18 @@ type dentry struct { // If this dentry represents a synthetic named pipe, pipe is the pipe // endpoint bound to this file. pipe *pipe.VFSPipe + + locks vfs.FileLocks + + // Inotify watches for this dentry. + // + // Note that inotify may behave unexpectedly in the presence of hard links, + // because dentries corresponding to the same file have separate inotify + // watches when they should share the same set. This is the case because it is + // impossible for us to know for sure whether two dentries correspond to the + // same underlying file (see the gofer filesystem section fo vfs/inotify.md for + // a more in-depth discussion on this matter). + watches vfs.Watches } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -697,15 +822,14 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma d := &dentry{ fs: fs, + qidPath: qid.Path, file: file, - ino: qid.Path, + ino: fs.inoFromQIDPath(qid.Path), mode: uint32(attr.Mode), uid: uint32(fs.opts.dfltuid), gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, - handle: handle{ - fd: -1, - }, + hostFD: -1, } d.pf.dentry = d if mask.UID { @@ -736,6 +860,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma d.nlink = uint32(attr.NLink) } d.vfsd.Init(d) + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Register(d, "gofer.dentry") + } fs.syncMu.Lock() fs.syncableDentries[d] = struct{}{} @@ -743,6 +870,21 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma return d, nil } +func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 { + fs.inoMu.Lock() + defer fs.inoMu.Unlock() + if ino, ok := fs.inoByQIDPath[qidPath]; ok { + return ino + } + ino := fs.nextIno() + fs.inoByQIDPath[qidPath] = ino + return ino +} + +func (fs *filesystem) nextIno() uint64 { + return atomic.AddUint64(&fs.lastIno, 1) +} + func (d *dentry) isSynthetic() bool { return d.file.isNil() } @@ -753,8 +895,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. -func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { - d.metadataMu.Lock() +// Precondition: d.metadataMu must be locked. +func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { d.metadataMu.Unlock() @@ -772,10 +914,12 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if attr.BlockSize != 0 { atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) } - if mask.ATime { + // Don't override newer client-defined timestamps with old server-defined + // ones. + if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 { atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) } - if mask.MTime { + if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 { atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) } if mask.CTime { @@ -788,25 +932,33 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) } if mask.Size { - d.dataMu.Lock() - atomic.StoreUint64(&d.size, attr.Size) - d.dataMu.Unlock() + d.updateSizeLocked(attr.Size) } - d.metadataMu.Unlock() } -// Preconditions: !d.isSynthetic() +// Preconditions: !d.isSynthetic(). func (d *dentry) updateFromGetattr(ctx context.Context) error { - // Use d.handle.file, which represents a 9P fid that has been opened, in - // preference to d.file, which represents a 9P fid that has not. This may - // be significantly more efficient in some implementations. + // Use d.readFile or d.writeFile, which represent 9P fids that have been + // opened, in preference to d.file, which represents a 9P fid that has not. + // This may be significantly more efficient in some implementations. Prefer + // d.writeFile over d.readFile since some filesystem implementations may + // update a writable handle's metadata after writes to that handle, without + // making metadata updates immediately visible to read-only handles + // representing the same file. var ( file p9file handleMuRLocked bool ) + // d.metadataMu must be locked *before* we getAttr so that we do not end up + // updating stale attributes in d.updateFromP9AttrsLocked(). + d.metadataMu.Lock() + defer d.metadataMu.Unlock() d.handleMu.RLock() - if !d.handle.file.isNil() { - file = d.handle.file + if !d.writeFile.isNil() { + file = d.writeFile + handleMuRLocked = true + } else if !d.readFile.isNil() { + file = d.readFile handleMuRLocked = true } else { file = d.file @@ -819,7 +971,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { if err != nil { return err } - d.updateFromP9Attrs(attrMask, &attr) + d.updateFromP9AttrsLocked(attrMask, &attr) return nil } @@ -831,23 +983,32 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) stat.Nlink = atomic.LoadUint32(&d.nlink) + if stat.Nlink == 0 { + // The remote filesystem doesn't support link count; just make + // something up. This is consistent with Linux, where + // fs/inode.c:inode_init_always() initializes link count to 1, and + // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if + // it's not provided by the remote filesystem. + stat.Nlink = 1 + } stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) - stat.Ino = d.ino + stat.Ino = uint64(d.ino) stat.Size = atomic.LoadUint64(&d.size) // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. stat.Blocks = (stat.Size + 511) / 512 - stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) - stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) - stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) - stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) + stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime)) + stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime)) + stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime)) + stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime)) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = d.fs.devMinor } -func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -855,37 +1016,47 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() - setLocalAtime := false - setLocalMtime := false + + if stat.Mask&linux.STATX_SIZE != 0 { + // Reject attempts to truncate files other than regular files, since + // filesystem implementations may return the wrong errno. + switch mode.FileType() { + case linux.S_IFREG: + // ok + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } + + var now int64 if d.cachedMetadataAuthoritative() { - // Timestamp updates will be handled locally. - setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 - setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 - stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - - // Prepare for truncate. - if stat.Mask&linux.STATX_SIZE != 0 { - switch d.mode & linux.S_IFMT { - case linux.S_IFREG: - if !setLocalMtime { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW - } - case linux.S_IFDIR: - return syserror.EISDIR - default: - return syserror.EINVAL + // Truncate updates mtime. + if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { + stat.Mask |= linux.STATX_MTIME + stat.Mtime = linux.StatxTimestamp{ + Nsec: linux.UTIME_NOW, } } + + // Use client clocks for timestamps. + now = d.fs.clock.Now().Nanoseconds() + if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { + stat.Atime = linux.NsecToStatxTimestamp(now) + } + if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { + stat.Mtime = linux.NsecToStatxTimestamp(now) + } } + d.metadataMu.Lock() defer d.metadataMu.Unlock() if !d.isSynthetic() { @@ -911,6 +1082,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin }); err != nil { return err } + if stat.Mask&linux.STATX_SIZE != 0 { + // d.size should be kept up to date, and privatized + // copy-on-write mappings of truncated pages need to be + // invalidated, even if InteropModeShared is in effect. + d.updateSizeLocked(stat.Size) + } } if d.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since @@ -920,7 +1097,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return nil } } - now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -930,61 +1106,104 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&linux.STATX_GID != 0 { atomic.StoreUint32(&d.gid, stat.GID) } - if setLocalAtime { - if stat.Atime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.atime, now) - } else { - atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) - } + // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because + // if d.cachedMetadataAuthoritative() then we converted stat.Atime and + // stat.Mtime to client-local timestamps above, and if + // !d.cachedMetadataAuthoritative() then we returned after calling + // d.file.setAttr(). For the same reason, now must have been initialized. + if stat.Mask&linux.STATX_ATIME != 0 { + atomic.StoreInt64(&d.atime, stat.Atime.ToNsec()) + atomic.StoreUint32(&d.atimeDirty, 0) } - if setLocalMtime { - if stat.Mtime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.mtime, now) - } else { - atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) - } + if stat.Mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec()) + atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) - if stat.Mask&linux.STATX_SIZE != 0 { + return nil +} + +// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// be held when allocate is called. +func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Allocating a smaller size is a noop. + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { + return nil + } + + err := allocate() + if err != nil { + return err + } + d.updateSizeLocked(size) + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + +// Preconditions: d.metadataMu must be locked. +func (d *dentry) updateSizeLocked(newSize uint64) { + d.dataMu.Lock() + oldSize := d.size + atomic.StoreUint64(&d.size, newSize) + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. d.dataMu.Lock() - oldSize := d.size - d.size = stat.Size - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings - // below. This allows concurrent calls to Read/Translate/etc. These - // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, - // so we can't race with Write or another truncate.) + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) d.dataMu.Unlock() - if d.size < oldSize { - oldpgend, _ := usermem.PageRoundUp(oldSize) - newpgend, _ := usermem.PageRoundUp(d.size) - if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ - // Compare Linux's mm/truncate.c:truncate_setsize() => - // truncate_pagecache() => - // mm/memory.c:unmap_mapping_range(evencows=1). - InvalidatePrivate: true, - }) - d.mapsMu.Unlock() - } - // We are now guaranteed that there are no translations of - // truncated pages, and can remove them from the cache. Since - // truncated pages have been removed from the remote file, they - // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) - d.dataMu.Unlock() - } } - return nil } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We only support xattrs prefixed with "user." (see b/148380782). Currently, + // there is no need to expose any other xattrs through a gofer. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + kuid := auth.KUID(atomic.LoadUint32(&d.uid)) + kgid := auth.KGID(atomic.LoadUint32(&d.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err + } + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) +} + +func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid))) +} + func dentryUIDFromP9UID(uid p9.UID) uint32 { if !uid.Ok() { return uint32(auth.OverflowUID) @@ -1020,10 +1239,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") @@ -1039,16 +1258,40 @@ func (d *dentry) decRefLocked() { } } +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (d *dentry) LeakMessage() string { + return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) +} + // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { + if d.isDir() { + events |= linux.IN_ISDIR + } + + d.fs.renameMu.RLock() + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) + } + d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) + d.fs.renameMu.RUnlock() +} // Watches implements vfs.DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *dentry) Watches() *vfs.Watches { - return nil + return &d.watches +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +// +// If no watches are left on this dentry and it has no references, cache it. +func (d *dentry) OnZeroWatches(ctx context.Context) { + if atomic.LoadInt64(&d.refs) == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked(ctx) + d.fs.renameMu.Unlock() + } } // checkCachingLocked should be called after d's reference count becomes 0 or it @@ -1060,13 +1303,18 @@ func (d *dentry) Watches() *vfs.Watches { // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // -// Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkCachingLocked() { +// Preconditions: d.fs.renameMu must be locked for writing; it may be +// temporarily unlocked. +func (d *dentry) checkCachingLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) refs := atomic.LoadInt64(&d.refs) + if refs == -1 { + // Dentry has already been destroyed. + return + } if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) @@ -1075,21 +1323,38 @@ func (d *dentry) checkCachingLocked() { } return } - if refs == -1 { - // Dentry has already been destroyed. - return - } // Deleted and invalidated dentries with zero references are no longer // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { + if d.isDeleted() { + d.watches.HandleDeletion(ctx) + } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } - d.destroyLocked() + d.destroyLocked(ctx) return } + // If d still has inotify watches and it is not deleted or invalidated, we + // cannot cache it and allow it to be evicted. Otherwise, we will lose its + // watches, even if a new dentry is created for the same file in the future. + // Note that the size of d.watches cannot concurrently transition from zero + // to non-zero, because adding a watch requires holding a reference on d. + if d.watches.Size() > 0 { + return + } + + if atomic.LoadInt32(&d.fs.released) != 0 { + if d.parent != nil { + d.parent.dirMu.Lock() + delete(d.parent.children, d.name) + d.parent.dirMu.Unlock() + } + d.destroyLocked(ctx) + } + // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.cachedDentries.Remove(d) @@ -1102,39 +1367,56 @@ func (d *dentry) checkCachingLocked() { d.fs.cachedDentriesLen++ d.cached = true if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { - victim := d.fs.cachedDentries.Back() - d.fs.cachedDentries.Remove(victim) - d.fs.cachedDentriesLen-- - victim.cached = false - // victim.refs may have become non-zero from an earlier path resolution - // since it was inserted into fs.cachedDentries. - if atomic.LoadInt64(&victim.refs) == 0 { - if victim.parent != nil { - victim.parent.dirMu.Lock() - if !victim.vfsd.IsDead() { - // Note that victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount points. - d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) - delete(victim.parent.children, victim.name) - // We're only deleting the dentry, not the file it - // represents, so we don't need to update - // victimParent.dirents etc. - } - victim.parent.dirMu.Unlock() - } - victim.destroyLocked() - } + d.fs.evictCachedDentryLocked(ctx) // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. } } -// destroyLocked destroys the dentry. It may flushes dirty pages from cache, -// close p9 file and remove reference on parent dentry. +// Precondition: fs.renameMu must be locked for writing; it may be temporarily +// unlocked. +func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { + for fs.cachedDentriesLen != 0 { + fs.evictCachedDentryLocked(ctx) + } +} + +// Preconditions: +// * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// * fs.cachedDentriesLen != 0. +func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { + victim := fs.cachedDentries.Back() + fs.cachedDentries.Remove(victim) + fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) == 0 { + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) + delete(victim.parent.children, victim.name) + // We're only deleting the dentry, not the file it + // represents, so we don't need to update + // victimParent.dirents etc. + } + victim.parent.dirMu.Unlock() + } + victim.destroyLocked(ctx) + } +} + +// destroyLocked destroys the dentry. // -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is -// not a child dentry. -func (d *dentry) destroyLocked() { +// Preconditions: +// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// * d.refs == 0. +// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal +// from its former parent dentry. +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -1145,43 +1427,76 @@ func (d *dentry) destroyLocked() { panic("dentry.destroyLocked() called with references on the dentry") } - ctx := context.Background() + // Allow the following to proceed without renameMu locked to improve + // scalability. + d.fs.renameMu.Unlock() + + mf := d.fs.mfp.MemoryFile() d.handleMu.Lock() - if !d.handle.file.isNil() { - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() + d.dataMu.Lock() + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty pages back to the remote filesystem. - if d.handleWritable { - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) - } + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } - // Discard cached data. + } + // Discard cached data. + if !d.cache.IsEmpty() { + mf.MarkAllUnevictable(d) d.cache.DropAll(mf) d.dirty.RemoveAll() - d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - d.handle.close(ctx) + } + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + if !d.readFile.isNil() { + d.readFile.close(ctx) + } + if !d.writeFile.isNil() && d.readFile != d.writeFile { + d.writeFile.close(ctx) + } + d.readFile = p9file{} + d.writeFile = p9file{} + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() if !d.file.isNil() { - d.file.close(ctx) + // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + // i.e. client and server timestamps may differ (because e.g. a client + // write was serviced by the page cache, and only written back to the + // remote file later). Ideally, we'd write client timestamps back to + // the remote filesystem so that timestamps for a new dentry + // instantiated for the same file would remain coherent. Unfortunately, + // this turns out to be too expensive in many cases, so for now we + // don't do this. + if err := d.file.close(ctx); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) + } d.file = p9file{} + // Remove d from the set of syncable dentries. d.fs.syncMu.Lock() delete(d.fs.syncableDentries, d) d.fs.syncMu.Unlock() } + + d.fs.renameMu.Lock() + // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if d.parent != nil { if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkCachingLocked() + d.parent.checkCachingLocked(ctx) } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } } + + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Unregister(d, "gofer.dentry") + } } func (d *dentry) isDeleted() bool { @@ -1192,10 +1507,8 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -// We only support xattrs prefixed with "user." (see b/148380782). Currently, -// there is no need to expose any other xattrs through a gofer. -func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { - if d.file.isNil() { +func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + if d.file.isNil() || !d.userXattrSupported() { return nil, nil } xattrMap, err := d.file.listXattr(ctx, size) @@ -1204,6 +1517,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui } xattrs := make([]string, 0, len(xattrMap)) for x := range xattrMap { + // We only support xattrs in the user.* namespace. if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { xattrs = append(xattrs, x) } @@ -1211,125 +1525,166 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { +func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { return "", syserror.ENODATA } - if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { +func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { +func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { return syserror.EPERM } - if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } return d.file.removeXattr(ctx, name) } -// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. +func (d *dentry) userXattrSupported() bool { + filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType() + return filetype == linux.ModeRegular || filetype == linux.ModeDirectory +} + +// Preconditions: +// * !d.isSynthetic(). +// * d.isRegularFile() || d.isDir(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). if !trunc { d.handleMu.RLock() - if (!read || d.handleReadable) && (!write || d.handleWritable) { - // The current handle is sufficient. + if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { + // Current handles are sufficient. d.handleMu.RUnlock() return nil } d.handleMu.RUnlock() } - haveOldFD := false + fdToClose := int32(-1) + invalidateTranslations := false d.handleMu.Lock() - if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { - // Get a new handle. - wantReadable := d.handleReadable || read - wantWritable := d.handleWritable || write - h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { + // Get a new handle. If this file has been opened for both reading and + // writing, try to get a single handle that is usable for both: + // + // - Writable memory mappings of a host FD require that the host FD is + // opened for both reading and writing. + // + // - NOTE(b/141991141): Some filesystems may not ensure coherence + // between multiple handles for the same file. + openReadable := !d.readFile.isNil() || read + openWritable := !d.writeFile.isNil() || write + h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) + if err == syserror.EACCES && (openReadable != read || openWritable != write) { + // It may not be possible to use a single handle for both + // reading and writing, since permissions on the file may have + // changed to e.g. disallow reading after previously being + // opened for reading. In this case, we have no choice but to + // use separate handles for reading and writing. + ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) + openReadable = read + openWritable = write + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } if err != nil { d.handleMu.Unlock() return err } - if !d.handle.file.isNil() { - // Check that old and new handles are compatible: If the old handle - // includes a host file descriptor but the new one does not, or - // vice versa, old and new memory mappings may be incoherent. - haveOldFD = d.handle.fd >= 0 - haveNewFD := h.fd >= 0 - if haveOldFD != haveNewFD { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) - h.close(ctx) - return syserror.EIO - } - if haveOldFD { - // We may have raced with callers of d.pf.FD() that are now - // using the old file descriptor, preventing us from safely - // closing it. We could handle this by invalidating existing - // memmap.Translations, but this is expensive. Instead, use - // dup3 to make the old file descriptor refer to the new file - // description, then close the new file descriptor (which is no - // longer needed). Racing callers may use the old or new file - // description, but this doesn't matter since they refer to the - // same file (unless d.fs.opts.overlayfsStaleRead is true, - // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { + + if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) { + // We have no existing FD, and the new FD meets the requirements + // for d.hostFD, so start using it. + d.hostFD = h.fd + } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable { + // We have an existing read-only FD, but the file has just been + // opened for writing, so we need to start supporting writable memory + // mappings. This may race with callers of d.pf.FD() using the existing + // FD, so in most cases we need to delay closing the old FD until after + // invalidating memmap.Translations that might have observed it. + if !openReadable || h.fd < 0 { + // We don't have a read/write FD, so we have no FD that can be + // used to create writable memory mappings. Switch to using the + // internal page cache. + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = -1 + } else if d.fs.opts.overlayfsStaleRead { + // We do have a read/write FD, but it may not be coherent with + // the existing read-only FD, so we must switch to mappings of + // the new FD in both the application and sentry. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } - syscall.Close(int(h.fd)) - h.fd = d.handle.fd - if d.fs.opts.overlayfsStaleRead { - // Replace sentry mappings of the old FD with mappings of - // the new FD, since the two are not necessarily coherent. - if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) - h.close(ctx) - return err - } + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = h.fd + } else { + // We do have a read/write FD. To avoid invalidating existing + // memmap.Translations (which is expensive), use dup3 to make + // the old file descriptor refer to the new file description, + // then close the new file descriptor (which is no longer + // needed). Racing callers of d.pf.FD() may use the old or new + // file description, but this doesn't matter since they refer + // to the same file, and any racing mappings must be read-only. + if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil { + oldHostFD := d.hostFD + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err) + h.close(ctx) + return err } - // Clunk the old fid before making the new handle visible (by - // unlocking d.handleMu). - d.handle.file.close(ctx) + fdToClose = h.fd } + } else { + // h.fd is not useful. + fdToClose = h.fd + } + + // Switch to new fids. + var oldReadFile p9file + if openReadable { + oldReadFile = d.readFile + d.readFile = h.file + } + var oldWriteFile p9file + if openWritable { + oldWriteFile = d.writeFile + d.writeFile = h.file + } + // NOTE(b/141991141): Clunk old fids before making new fids visible (by + // unlocking d.handleMu). + if !oldReadFile.isNil() { + oldReadFile.close(ctx) + } + if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { + oldWriteFile.close(ctx) } - // Switch to the new handle. - d.handle = h - d.handleReadable = wantReadable - d.handleWritable = wantWritable } d.handleMu.Unlock() - if d.fs.opts.overlayfsStaleRead && haveOldFD { - // Invalidate application mappings that may be using the old FD; they + if invalidateTranslations { + // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls // to d.Translate(). This requires holding d.mapsMu, which precedes // d.handleMu in the lock order. @@ -1337,35 +1692,109 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool d.mappings.InvalidateAll(memmap.InvalidateOpts{}) d.mapsMu.Unlock() } + if fdToClose >= 0 { + syscall.Close(int(fdToClose)) + } + + return nil +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) readHandleLocked() handle { + return handle{ + file: d.readFile, + fd: d.hostFD, + } +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) writeHandleLocked() handle { + return handle{ + file: d.writeFile, + fd: d.hostFD, + } +} + +func (d *dentry) syncRemoteFile(ctx context.Context) error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.syncRemoteFileLocked(ctx) +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if d.hostFD >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(d.hostFD)) + ctx.UninterruptibleSleepFinish(false) + return err + } + if !d.writeFile.isNil() { + return d.writeFile.fsync(ctx) + } + if !d.readFile.isNil() { + return d.readFile.fsync(ctx) + } + return nil +} +func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() + if h.isOpen() { + // Write back dirty pages to the remote file. + d.dataMu.Lock() + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) + d.dataMu.Unlock() + if err != nil { + return err + } + } + if err := d.syncRemoteFileLocked(ctx); err != nil { + if !forFilesystemSync { + return err + } + // Only return err if we can reasonably have expected sync to succeed + // (d is a regular file and was opened for writing). + if d.isRegularFile() && h.isOpen() { + return err + } + ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) + } return nil } // incLinks increments link count. -// -// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. func (d *dentry) incLinks() { - v := atomic.AddUint32(&d.nlink, 1) - if v < 2 { - panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, 1) } // decLinks decrements link count. -// -// Preconditions: d.nlink > 1. func (d *dentry) decLinks() { - v := atomic.AddUint32(&d.nlink, ^uint32(0)) - if v == 0 { - panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, ^uint32(0)) } // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD + + lockLogging sync.Once `state:"nosave"` } func (fd *fileDescription) filesystem() *filesystem { @@ -1394,25 +1823,62 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) + if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil { + return err + } + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) + } + return nil +} + +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) +} + +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { + d := fd.dentry() + if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { + d := fd.dentry() + if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("File lock using gofer file handled internally.") + }) + return fd.LockFD.LockBSD(ctx, uid, t, block) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("Range lock using gofer file handled internally.") + }) + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index adff39490..76f08e252 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -20,18 +20,21 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) func TestDestroyIdempotent(t *testing.T) { + ctx := contexttest.Context(t) fs := filesystem{ - syncableDentries: make(map[*dentry]struct{}), + mfp: pgalloc.MemoryFileProviderFromContext(ctx), opts: filesystemOptions{ // Test relies on no dentry being held in the cache. maxCachedDentries: 0, }, + syncableDentries: make(map[*dentry]struct{}), + inoByQIDPath: make(map[uint64]uint64), } - ctx := contexttest.Context(t) attr := &p9.Attr{ Mode: p9.ModeRegular, } @@ -50,7 +53,9 @@ func TestDestroyIdempotent(t *testing.T) { } parent.cacheNewChildLocked(child, "child") - child.checkCachingLocked() + fs.renameMu.Lock() + defer fs.renameMu.Unlock() + child.checkCachingLocked(ctx) if got := atomic.LoadInt64(&child.refs); got != -1 { t.Fatalf("child.refs=%d, want: -1", got) } @@ -58,6 +63,6 @@ func TestDestroyIdempotent(t *testing.T) { if got := atomic.LoadInt64(&parent.refs); got != -1 { t.Fatalf("parent.refs=%d, want: -1", got) } - child.checkCachingLocked() - child.checkCachingLocked() + child.checkCachingLocked(ctx) + child.checkCachingLocked(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 724a3f1f7..a9ebe1206 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -25,6 +25,8 @@ import ( // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. +// +// These are explicitly not savable. type handle struct { file p9file fd int32 // -1 if unavailable @@ -63,6 +65,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand }, nil } +func (h *handle) isOpen() bool { + return !h.file.isNil() +} + func (h *handle) close(ctx context.Context) { h.file.close(ctx) h.file = p9file{} @@ -124,13 +130,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } return cp, cperr } - -func (h *handle) sync(ctx context.Context) error { - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - err := syscall.Fsync(int(h.fd)) - ctx.UninterruptibleSleepFinish(false) - return err - } - return h.file.fsync(ctx) -} diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go index 7294de7d6..c7bf10007 100644 --- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -51,8 +51,24 @@ func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { if ok { return nil } - if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { - return err + if sleepErr := sleepBetweenNamedPipeOpenChecks(ctx); sleepErr != nil { + // Another application thread may have opened this pipe for + // writing, succeeded because we previously opened the pipe for + // reading, and subsequently interrupted us for checkpointing (e.g. + // this occurs in mknod tests under cooperative save/restore). In + // this case, our open has to succeed for the checkpoint to include + // a readable FD for the pipe, which is in turn necessary to + // restore the other thread's writable FD for the same pipe + // (otherwise it will get ENXIO). So we have to check + // nonblockingPipeHasWriter() once last time. + ok, err := nonblockingPipeHasWriter(fd) + if err != nil { + return err + } + if ok { + return nil + } + return sleepErr } } } diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 87f0b877f..21b4a96fe 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error { return err } +func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttrClose(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) fdobj, qid, iounit, err := f.file.Open(flags) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 0d10cf7ac..dc8a890cb 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -18,19 +18,19 @@ import ( "fmt" "io" "math" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -39,16 +39,17 @@ func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. @@ -56,15 +57,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } - // Skip flushing if writes may be buffered by the client, since (as with - // the VFS1 client) we don't flush buffered writes on close anyway. + // Skip flushing if there are client-buffered writes, since (as with the + // VFS1 client) we don't flush buffered writes on close anyway. d := fd.dentry() - if d.fs.opts.interop == InteropModeExclusive { + if d.fs.opts.interop != InteropModeExclusive { + return nil + } + d.dataMu.RLock() + haveDirtyPages := !d.dirty.IsEmpty() + d.dataMu.RUnlock() + if haveDirtyPages { return nil } d.handleMu.RLock() defer d.handleMu.RUnlock() - return d.handle.file.flush(ctx) + if d.writeFile.isNil() { + return nil + } + return d.writeFile.flush(ctx) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + d := fd.dentry() + return d.doAllocate(ctx, offset, length, func() error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) } // PRead implements vfs.FileDescriptionImpl.PRead. @@ -72,17 +92,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). d := fd.dentry() - if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } + var ( + n int64 + readErr error + ) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Lock d.metadataMu for the rest of the read to prevent d.size from // changing. @@ -93,20 +121,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { return 0, err } - } - - rw := getDentryReadWriter(ctx, d, offset) - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + rw := getDentryReadWriter(ctx, d, offset) // Require the read to go to the remote file. rw.direct = true + n, readErr = dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtimeLocked(fd.vfsfd.Mount()) + } + } else { + rw := getDentryReadWriter(ctx, d, offset) + n, readErr = dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(fd.vfsfd.Mount()) + } } - n, err := dst.CopyOutFrom(ctx, rw) - putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { - // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(fd.vfsfd.Mount()) - } - return n, err + return n, readErr } // Read implements vfs.FileDescriptionImpl.Read. @@ -120,85 +153,134 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, offset, syserror.EOPNOTSUPP + } + + d := fd.dentry() + // If the fd was opened with O_APPEND, make sure the file size is updated. + // There is a possible race here if size is modified externally after + // metadata cache is updated. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } + } + + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Set offset to file size if the fd was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) - d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Write dirty cached pages that will be touched by the write back to - // the remote file. - if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err - } - // Remove touched pages from the cache. - pgstart := usermem.PageRoundDown(uint64(offset)) - pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) - if !ok { - return 0, syserror.EINVAL - } - mr := memmap.MappableRange{pgstart, pgend} - var freed []platform.FileRange - d.dataMu.Lock() - cseg := d.cache.LowerBoundSegment(mr.Start) - for cseg.Ok() && cseg.Start() < mr.End { - cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) - cseg = d.cache.Remove(cseg).NextSegment() - } - d.dataMu.Unlock() - // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() - // Finally free pages removed from the cache. - mf := d.fs.mfp.MemoryFile() - for _, freedFR := range freed { - mf.DecRef(freedFR) - } - } + rw := getDentryReadWriter(ctx, d, offset) + defer putDentryReadWriter(rw) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + if err := fd.writeCache(ctx, d, offset, src); err != nil { + return 0, offset, err + } + // Require the write to go to the remote file. rw.direct = true } + n, err := src.CopyInTo(ctx, rw) - putDentryReadWriter(rw) - if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + if err != nil { + return n, offset + n, err + } + if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Note that if any of the following fail, then we can't guarantee that + // any data was actually written with the semantics of O_DSYNC or + // O_SYNC, so we return zero bytes written. Compare Linux's + // mm/filemap.c:generic_file_write_iter() => + // include/linux/fs.h:generic_write_sync(). + // // Write dirty cached pages touched by the write back to the remote // file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err + return 0, offset, err } // Request the remote filesystem to sync the remote file. - if err := d.handle.file.fsync(ctx); err != nil { - return 0, err + if err := d.syncRemoteFile(ctx); err != nil { + return 0, offset, err } } - return n, err + return n, offset + n, nil +} + +func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return err + } + + // Remove touched pages from the cache. + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { + return syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []memmap.FileRange + + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + return nil } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } @@ -241,10 +323,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from - // dentry.handle without locking dentry.dataMu. + // dentry.readHandleLocked() without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + h := rw.d.readHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err @@ -312,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { @@ -320,14 +403,14 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) rw.d.handleMu.RUnlock() return done, err } - // err might have occurred in part of gap.Range() outside - // gapMR. Forget about it for now; if the error matters and - // persists, we'll run into it again in a later iteration of - // this loop. + // err might have occurred in part of gap.Range() outside gapMR + // (in particular, gap.End() might be beyond EOF). Forget about + // it for now; if the error matters and persists, we'll run + // into it again in a later iteration of this loop. } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) @@ -359,11 +442,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was - // opened with O_DIRECT, write directly to dentry.handle without locking - // dentry.dataMu. + // opened with O_DIRECT, write directly to dentry.writeHandleLocked() + // without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + h := rw.d.writeHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size { @@ -425,7 +509,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) @@ -451,7 +535,7 @@ exitLoop: if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 @@ -469,6 +553,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). @@ -482,22 +567,31 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), - }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() + newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) + if err != nil { + return 0, err + } + fd.off = newOffset + return newOffset, nil +} + +// Calculate the new offset for a seek operation on a regular file. +func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { switch whence { case linux.SEEK_SET: // Use offset as specified. case linux.SEEK_CUR: - offset += fd.off + offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. - d := fd.dentry() - if fd.filesystem().opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { if err := d.updateFromGetattr(ctx); err != nil { return 0, err } @@ -525,31 +619,12 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( if offset < 0 { return 0, syserror.EINVAL } - fd.off = offset return offset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) -} - -func (d *dentry) syncSharedHandle(ctx context.Context) error { - d.handleMu.RLock() - if !d.handleWritable { - d.handleMu.RUnlock() - return nil - } - d.dataMu.Lock() - // Write dirty cached data to the remote file. - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) - d.dataMu.Unlock() - if err == nil { - // Sync the remote file. - err = d.handle.sync(ctx) - } - d.handleMu.RUnlock() - return err + return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. @@ -573,7 +648,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt return syserror.ENODEV } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() if !haveFD { return syserror.ENODEV @@ -594,7 +669,7 @@ func (d *dentry) mayCachePages() bool { return true } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() return haveFD } @@ -652,7 +727,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() - if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + if d.hostFD >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { @@ -688,7 +763,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab } mf := d.fs.mfp.MemoryFile() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + h := d.readHandleLocked() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -747,7 +823,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() @@ -757,9 +833,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. mf := d.fs.mfp.MemoryFile() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { return err } @@ -774,20 +853,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() d.mapsMu.Lock() defer d.mapsMu.Unlock() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mfp.MemoryFile() // Only allow pages that are no longer memory-mapped to be evicted. for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } d.cache.Drop(mgapMR, mf) @@ -795,53 +877,53 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { } } -// dentryPlatformFile implements platform.File. It exists solely because dentry -// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// dentryPlatformFile implements memmap.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file -// is available (i.e. dentry.handle.fd >= 0), and that FD is used for -// application memory mappings (i.e. !filesystem.opts.forcePageCache). +// is available (i.e. dentry.hostFD >= 0), and that FD is used for application +// memory mappings (i.e. !filesystem.opts.forcePageCache). +// +// +stateify savable type dentryPlatformFile struct { *dentry - // fdRefs counts references on platform.File offsets. fdRefs is protected + // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet - // If this dentry represents a regular file, and handle.fd >= 0, - // hostFileMapper caches mappings of handle.fd. + // If this dentry represents a regular file, and dentry.hostFD >= 0, + // hostFileMapper caches mappings of dentry.hostFD. hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. - hostFileMapperInitOnce sync.Once + hostFileMapperInitOnce sync.Once `state:"nosave"` } -// IncRef implements platform.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } -// DecRef implements platform.File.DecRef. -func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() - bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) - d.handleMu.RUnlock() - return bs, err + defer d.handleMu.RUnlock() + return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() - fd := d.handle.fd - d.handleMu.RUnlock() - return int(fd) + defer d.handleMu.RUnlock() + return int(d.hostFD) } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go new file mode 100644 index 000000000..2ea224c43 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -0,0 +1,329 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/refsvfs2" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type saveRestoreContextID int + +const ( + // CtxRestoreServerFDMap is a Context.Value key for a map[string]int + // mapping filesystem unique IDs (cf. InternalFilesystemOptions.UniqueID) + // to host FDs. + CtxRestoreServerFDMap saveRestoreContextID = iota +) + +// +stateify savable +type savedDentryRW struct { + read bool + write bool +} + +// PreprareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave. +func (fs *filesystem) PrepareSave(ctx context.Context) error { + if len(fs.iopts.UniqueID) == 0 { + return fmt.Errorf("gofer.filesystem with no UniqueID cannot be saved") + } + + // Purge cached dentries, which may not be reopenable after restore due to + // permission changes. + fs.renameMu.Lock() + fs.evictAllCachedDentriesLocked(ctx) + fs.renameMu.Unlock() + + // Buffer pipe data so that it's available for reading after restore. (This + // is a legacy VFS1 feature.) + fs.syncMu.Lock() + for sffd := range fs.specialFileFDs { + if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() { + if err := sffd.savePipeData(ctx); err != nil { + fs.syncMu.Unlock() + return err + } + } + } + fs.syncMu.Unlock() + + // Flush local state to the remote filesystem. + if err := fs.Sync(ctx); err != nil { + return err + } + + fs.savedDentryRW = make(map[*dentry]savedDentryRW) + return fs.root.prepareSaveRecursive(ctx) +} + +// Preconditions: +// * fd represents a pipe. +// * fd is readable. +func (fd *specialFileFD) savePipeData(ctx context.Context) error { + fd.bufMu.Lock() + defer fd.bufMu.Unlock() + var buf [usermem.PageSize]byte + for { + n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), ^uint64(0)) + if n != 0 { + fd.buf = append(fd.buf, buf[:n]...) + } + if err != nil { + if err == io.EOF || err == syserror.EAGAIN { + break + } + return err + } + } + if len(fd.buf) != 0 { + atomic.StoreUint32(&fd.haveBuf, 1) + } + return nil +} + +func (d *dentry) prepareSaveRecursive(ctx context.Context) error { + if d.isRegularFile() && !d.cachedMetadataAuthoritative() { + // Get updated metadata for d in case we need to perform metadata + // validation during restore. + if err := d.updateFromGetattr(ctx); err != nil { + return err + } + } + if !d.readFile.isNil() || !d.writeFile.isNil() { + d.fs.savedDentryRW[d] = savedDentryRW{ + read: !d.readFile.isNil(), + write: !d.writeFile.isNil(), + } + } + d.dirMu.Lock() + defer d.dirMu.Unlock() + for _, child := range d.children { + if child != nil { + if err := child.prepareSaveRecursive(ctx); err != nil { + return err + } + } + } + return nil +} + +// beforeSave is invoked by stateify. +func (d *dentry) beforeSave() { + if d.vfsd.IsDead() { + panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: deleted and invalidated dentries can't be restored", genericDebugPathname(d))) + } +} + +// afterLoad is invoked by stateify. +func (d *dentry) afterLoad() { + d.hostFD = -1 + if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 { + refsvfs2.Register(d, "gofer.dentry") + } +} + +// afterLoad is invoked by stateify. +func (d *dentryPlatformFile) afterLoad() { + if d.hostFileMapper.IsInited() { + // Ensure that we don't call d.hostFileMapper.Init() again. + d.hostFileMapperInitOnce.Do(func() {}) + } +} + +// afterLoad is invoked by stateify. +func (fd *specialFileFD) afterLoad() { + fd.handle.fd = -1 +} + +// CompleteRestore implements +// vfs.FilesystemImplSaveRestoreExtension.CompleteRestore. +func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error { + fdmapv := ctx.Value(CtxRestoreServerFDMap) + if fdmapv == nil { + return fmt.Errorf("no server FD map available") + } + fdmap := fdmapv.(map[string]int) + fd, ok := fdmap[fs.iopts.UniqueID] + if !ok { + return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID) + } + fs.opts.fd = fd + if err := fs.dial(ctx); err != nil { + return err + } + fs.inoByQIDPath = make(map[uint64]uint64) + + // Restore the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := fs.client.Attach(fs.opts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + return err + } + if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil { + return err + } + + // Restore remaining dentries. + if err := fs.root.restoreDescendantsRecursive(ctx, &opts); err != nil { + return err + } + + // Re-open handles for specialFileFDs. Unlike the initial open + // (dentry.openSpecialFile()), pipes are always opened without blocking; + // non-readable pipe FDs are opened last to ensure that they don't get + // ENXIO if another specialFileFD represents the read end of the same pipe. + // This is consistent with VFS1. + haveWriteOnlyPipes := false + for fd := range fs.specialFileFDs { + if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { + haveWriteOnlyPipes = true + continue + } + if err := fd.completeRestore(ctx); err != nil { + return err + } + } + if haveWriteOnlyPipes { + for fd := range fs.specialFileFDs { + if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() { + if err := fd.completeRestore(ctx); err != nil { + return err + } + } + } + } + + // Discard state only required during restore. + fs.savedDentryRW = nil + + return nil +} + +func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrMask p9.AttrMask, attr *p9.Attr, opts *vfs.CompleteRestoreOptions) error { + d.file = file + + // Gofers do not preserve QID across checkpoint/restore, so: + // + // - We must assume that the remote filesystem did not change in a way that + // would invalidate dentries, since we can't revalidate dentries by + // checking QIDs. + // + // - We need to associate the new QID.Path with the existing d.ino. + d.qidPath = qid.Path + d.fs.inoMu.Lock() + d.fs.inoByQIDPath[qid.Path] = d.ino + d.fs.inoMu.Unlock() + + // Check metadata stability before updating metadata. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.isRegularFile() { + if opts.ValidateFileSizes { + if !attrMask.Size { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d)) + } + if d.size != attr.Size { + return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, attr.Size) + } + } + if opts.ValidateFileModificationTimestamps { + if !attrMask.MTime { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d)) + } + if want := dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds); d.mtime != want { + return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want)) + } + } + } + if !d.cachedMetadataAuthoritative() { + d.updateFromP9AttrsLocked(attrMask, attr) + } + + if rw, ok := d.fs.savedDentryRW[d]; ok { + if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { + return err + } + } + + return nil +} + +// Preconditions: d is not synthetic. +func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { + for _, child := range d.children { + if child == nil { + continue + } + if _, ok := d.fs.syncableDentries[child]; !ok { + // child is synthetic. + continue + } + if err := child.restoreRecursive(ctx, opts); err != nil { + return err + } + } + return nil +} + +// Preconditions: d is not synthetic (but note that since this function +// restores d.file, d.file.isNil() is always true at this point, so this can +// only be detected by checking filesystem.syncableDentries). d.parent has been +// restored. +func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { + qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name) + if err != nil { + return err + } + if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil { + return err + } + return d.restoreDescendantsRecursive(ctx, opts) +} + +func (fd *specialFileFD) completeRestore(ctx context.Context) error { + d := fd.dentry() + h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) + if err != nil { + return err + } + fd.handle = h + + ftype := d.fileType() + fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0 + if fd.haveQueue { + if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil { + return err + } + } + + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index d6dbe9092..a21199eac 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -36,13 +36,12 @@ func (d *dentry) isSocket() bool { // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() // is called and either BoundEndpoint.BidirectionalConnect or // BoundEndpoint.UnidirectionalConnect is called. +// +// +stateify savable type endpoint struct { // dentry is the filesystem dentry which produced this endpoint. dentry *dentry - // file is the p9 file that contains a single unopened fid. - file p9.File - // path is the sentry path where this endpoint is bound. path string } @@ -108,13 +107,13 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { - hostFile, err := e.file.Connect(flags) + hostFile, err := e.dentry.file.connect(ctx, flags) if err != nil { return nil, syserr.ErrConnectionRefused } @@ -129,15 +128,15 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) if serr != nil { - log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr) + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr) return nil, serr } return c, nil } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.dentry.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index ff6126b87..625400c0b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -15,52 +15,73 @@ package gofer import ( - "sync" + "sync/atomic" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device -// special files, and (when filesystemOptions.specialRegularFiles is in effect) -// regular files. specialFileFD differs from regularFileFD by using per-FD -// handles instead of shared per-dentry handles, and never buffering I/O. +// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is +// in effect) regular files. specialFileFD differs from regularFileFD by using +// per-FD handles instead of shared per-dentry handles, and never buffering I/O. +// +// +stateify savable type specialFileFD struct { fileDescription // handle is used for file I/O. handle is immutable. - handle handle + handle handle `state:"nosave"` + + // isRegularFile is true if this FD represents a regular file which is only + // possible when filesystemOptions.regularFilesUseSpecialFileFD is in + // effect. isRegularFile is immutable. + isRegularFile bool // seekable is true if this file description represents a file for which - // file offset is significant, i.e. a regular file. seekable is immutable. + // file offset is significant, i.e. a regular file, character device or + // block device. seekable is immutable. seekable bool - // mayBlock is true if this file description represents a file for which - // queue may send I/O readiness events. mayBlock is immutable. - mayBlock bool - queue waiter.Queue + // haveQueue is true if this file description represents a file for which + // queue may send I/O readiness events. haveQueue is immutable. + haveQueue bool `state:"nosave"` + queue waiter.Queue // If seekable is true, off is the file offset. off is protected by mu. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` off int64 + + // If haveBuf is non-zero, this FD represents a pipe, and buf contains data + // read from the pipe from previous calls to specialFileFD.savePipeData(). + // haveBuf and buf are protected by bufMu. haveBuf is accessed using atomic + // memory operations. + bufMu sync.Mutex `state:"nosave"` + haveBuf uint32 + buf []byte } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() - seekable := ftype == linux.S_IFREG - mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK + seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK + haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0 fd := &specialFileFD{ - handle: h, - seekable: seekable, - mayBlock: mayBlock, + handle: h, + isRegularFile: ftype == linux.S_IFREG, + seekable: seekable, + haveQueue: haveQueue, } - if mayBlock && h.fd >= 0 { + fd.LockFD.Init(locks) + if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err } @@ -69,20 +90,23 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci DenyPRead: !seekable, DenyPWrite: !seekable, }); err != nil { - if mayBlock && h.fd >= 0 { + if haveQueue { fdnotifier.RemoveFD(h.fd) } return nil, err } + d.fs.syncMu.Lock() + d.fs.specialFileFDs[fd] = struct{}{} + d.fs.syncMu.Unlock() return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *specialFileFD) Release() { - if fd.mayBlock && fd.handle.fd >= 0 { +func (fd *specialFileFD) Release(ctx context.Context) { + if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } - fd.handle.close(context.Background()) + fd.handle.close(ctx) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) @@ -99,7 +123,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { // Readiness implements waiter.Waitable.Readiness. func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { - if fd.mayBlock { + if fd.haveQueue { return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) } return fd.fileDescription.Readiness(mask) @@ -107,8 +131,9 @@ func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { // EventRegister implements waiter.Waitable.EventRegister. func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - if fd.mayBlock { + if fd.haveQueue { fd.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(fd.handle.fd) return } fd.fileDescription.EventRegister(e, mask) @@ -116,42 +141,82 @@ func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { - if fd.mayBlock { + if fd.haveQueue { fd.queue.EventUnregister(e) + fdnotifier.UpdateFD(fd.handle.fd) return } fd.fileDescription.EventUnregister(e) } +func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + if fd.isRegularFile { + d := fd.dentry() + return d.doAllocate(ctx, offset, length, func() error { + return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + }) + } + return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } - // Going through dst.CopyOutFrom() holds MM locks around file operations of - // unknown duration. For regularFileFD, doing so is necessary to support - // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't - // hold here since specialFileFD doesn't client-cache data. Just buffer the - // read instead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d := fd.dentry(); d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } + + bufN := int64(0) + if atomic.LoadUint32(&fd.haveBuf) != 0 { + var err error + fd.bufMu.Lock() + if len(fd.buf) != 0 { + var n int + n, err = dst.CopyOut(ctx, fd.buf) + dst = dst.DropFirst(n) + fd.buf = fd.buf[n:] + if len(fd.buf) == 0 { + atomic.StoreUint32(&fd.haveBuf, 0) + fd.buf = nil + } + bufN = int64(n) + if offset >= 0 { + offset += bufN + } + } + fd.bufMu.Unlock() + if err != nil { + return bufN, err + } + } + + // Going through dst.CopyOutFrom() would hold MM locks around file + // operations of unknown duration. For regularFileFD, doing so is necessary + // to support mmap due to lock ordering; MM locks precede dentry.dataMu. + // That doesn't hold here since specialFileFD doesn't client-cache data. + // Just buffer the read instead. buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } if n == 0 { - return 0, err + return bufN, err } if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { - return int64(cp), cperr + return bufN + int64(cp), cperr } - return int64(n), err + return bufN + int64(n), err } // Read implements vfs.FileDescriptionImpl.Read. @@ -169,35 +234,82 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } - if opts.Flags != 0 { - return 0, syserror.EOPNOTSUPP + + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, offset, syserror.EOPNOTSUPP } - if fd.seekable { + d := fd.dentry() + if fd.isRegularFile { + // If the regular file fd was opened with O_APPEND, make sure the file + // size is updated. There is a possible race here if size is modified + // externally after metadata cache is updated. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } + } + + // We need to hold the metadataMu *while* writing to a regular file. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Set offset to file size if the regular file was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) + } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) } // Do a buffered write. See rationale in PRead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchCMtime() } buf := make([]byte, src.NumBytes()) - // Don't do partial writes if we get a partial read from src. - if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, err + copied, copyErr := src.CopyIn(ctx, buf) + if copied == 0 && copyErr != nil { + // Only return the error if we didn't get any data. + return 0, offset, copyErr } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } - return int64(n), err + // Update offset if the offset is valid. + if offset >= 0 { + offset += int64(n) + } + // Update file size for regular files. + if fd.isRegularFile { + // d.metadataMu is already locked at this point. + if uint64(offset) > d.size { + d.dataMu.Lock() + defer d.dataMu.Unlock() + atomic.StoreUint64(&d.size, uint64(offset)) + } + } + if err != nil { + return int64(n), offset, err + } + return int64(n), offset, copyErr } // Write implements vfs.FileDescriptionImpl.Write. @@ -207,8 +319,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts } fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } @@ -220,27 +332,41 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( } fd.mu.Lock() defer fd.mu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as given. - case linux.SEEK_CUR: - offset += fd.off - default: - // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not - // clear that file size is even meaningful for these files. - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL + newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) + if err != nil { + return 0, err } - fd.off = offset - return offset, nil + fd.off = newOffset + return newOffset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil + return fd.sync(ctx, false /* forFilesystemSync */) +} + +func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { + err := func() error { + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if fd.handle.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(fd.handle.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return fd.handle.file.fsync(ctx) + }() + if err != nil { + if !forFilesystemSync { + return err + } + // Only return err if we can reasonably have expected sync to succeed + // (fd represents a regular file that was opened for writing). + if fd.isRegularFile && fd.vfsfd.IsWritable() { + return err + } + ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err) } - return fd.handle.sync(ctx) + return nil } diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 2608e7e1d..9cbe805b9 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -17,7 +17,6 @@ package gofer import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -25,31 +24,39 @@ func dentryTimestampFromP9(s, ns uint64) int64 { return int64(s*1e9 + ns) } -func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { - return ts.Sec*1e9 + int64(ts.Nsec) -} - -func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { - return linux.StatxTimestamp{ - Sec: ns / 1e9, - Nsec: uint32(ns % 1e9), - } -} - -// Preconditions: fs.interop != InteropModeShared. +// Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime || mnt.ReadOnly() { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) + atomic.StoreUint32(&d.atimeDirty, 1) d.metadataMu.Unlock() mnt.EndWrite() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true. +func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { + if mnt.Flags.NoATime || mnt.ReadOnly() { + return + } + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.atime, now) + atomic.StoreUint32(&d.atimeDirty, 1) + mnt.EndWrite() +} + +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. +// * The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -57,18 +64,24 @@ func (d *dentry) touchCtime() { d.metadataMu.Unlock() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. +// * The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) d.metadataMu.Unlock() } +// Preconditions: +// * d.cachedMetadataAuthoritative() == true. +// * The caller has locked d.metadataMu. func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) } diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index ca0fe6d2b..dc0f86061 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -1,14 +1,40 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "host", + prefix = "inode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "inode", + }, +) + +go_template_instance( + name = "connected_endpoint_refs", + out = "connected_endpoint_refs.go", + package = "host", + prefix = "ConnectedEndpoint", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "ConnectedEndpoint", + }, +) + go_library( name = "host", srcs = [ + "connected_endpoint_refs.go", "control.go", "host.go", + "inode_refs.go", "ioctl_unsafe.go", "mmap.go", + "save_restore.go", "socket.go", "socket_iovec.go", "socket_unsafe.go", @@ -22,17 +48,20 @@ go_library( "//pkg/context", "//pkg/fdnotifier", "//pkg/fspath", + "//pkg/iovec", "//pkg/log", + "//pkg/marshal/primitive", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go index b9082a20f..13ef48cb5 100644 --- a/pkg/sentry/fsimpl/host/control.go +++ b/pkg/sentry/fsimpl/host/control.go @@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } @@ -79,7 +79,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription { } // Create the file backed by hostFD. - file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */) + file, err := NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &NewFDOptions{}) if err != nil { ctx.Warningf("Error creating file from host FD: %v", err) break diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 18b127521..eeed0f97d 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -19,6 +19,7 @@ package host import ( "fmt" "math" + "sync/atomic" "syscall" "golang.org/x/sys/unix" @@ -27,7 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -40,8 +41,123 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// inode implements kernfs.Inode. +// +// +stateify savable +type inode struct { + kernfs.InodeNoStatFS + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. + + locks vfs.FileLocks + + // When the reference count reaches zero, the host fd is closed. + inodeRefs + + // hostFD contains the host fd that this file was originally created from, + // which must be available at time of restore. + // + // This field is initialized at creation time and is immutable. + hostFD int + + // ino is an inode number unique within this filesystem. + // + // This field is initialized at creation time and is immutable. + ino uint64 + + // ftype is the file's type (a linux.S_IFMT mask). + // + // This field is initialized at creation time and is immutable. + ftype uint16 + + // mayBlock is true if hostFD is non-blocking, and operations on it may + // return EAGAIN or EWOULDBLOCK instead of blocking. + // + // This field is initialized at creation time and is immutable. + mayBlock bool + + // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file + // offsets are meaningful iff seekable is true. + // + // This field is initialized at creation time and is immutable. + seekable bool + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool + + // savable is true if hostFD may be saved/restored by its numeric value. + // + // This field is initialized at creation time and is immutable. + savable bool + + // Event queue for blocking operations. + queue waiter.Queue + + // mapsMu protects mappings. + mapsMu sync.Mutex `state:"nosave"` + + // If this file is mmappable, mappings tracks mappings of hostFD into + // memmap.MappingSpaces. + mappings memmap.MappingSet + + // pf implements platform.File for mappings of hostFD. + pf inodePlatformFile + + // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data + // read from the pipe from previous calls to inode.beforeSave(). haveBuf + // and buf are protected by bufMu. haveBuf is accessed using atomic memory + // operations. + bufMu sync.Mutex `state:"nosave"` + haveBuf uint32 + buf []byte +} + +func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) { + // Determine if hostFD is seekable. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE + // We expect regular files to be seekable, as this is required for them to + // be memory-mappable. + if !seekable && fileType == syscall.S_IFREG { + ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) + return nil, syserror.ESPIPE + } + + i := &inode{ + hostFD: hostFD, + ino: fs.NextIno(), + ftype: uint16(fileType), + mayBlock: fileType != syscall.S_IFREG && fileType != syscall.S_IFDIR, + seekable: seekable, + isTTY: isTTY, + savable: savable, + } + i.pf.inode = i + i.EnableLeakCheck() + + // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and + // handle blocking behavior in the sentry. + if i.mayBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + return nil, err + } + } + return i, nil +} + // NewFDOptions contains options to NewFD. type NewFDOptions struct { + // If Savable is true, the host file descriptor may be saved/restored by + // numeric value; the sandbox API requires a corresponding host FD with the + // same numeric value to be provieded at time of restore. + Savable bool + // If IsTTY is true, the file descriptor is a TTY. IsTTY bool @@ -75,73 +191,48 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) flags = uint32(flagsInt) } - fileMode := linux.FileMode(s.Mode) - fileType := fileMode.FileType() - - // Determine if hostFD is seekable. If not, this syscall will return ESPIPE - // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character - // devices. - _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) - seekable := err != syserror.ESPIPE - - i := &inode{ - hostFD: hostFD, - ino: fs.NextIno(), - isTTY: opts.IsTTY, - wouldBlock: wouldBlock(uint32(fileType)), - seekable: seekable, - canMap: canMap(uint32(fileType)), - } - i.pf.inode = i - - // Non-seekable files can't be memory mapped, assert this. - if !i.seekable && i.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - // If the hostFD would block, we must set it to non-blocking and handle - // blocking behavior in the sentry. - if i.wouldBlock { - if err := syscall.SetNonblock(i.hostFD, true); err != nil { - return nil, err - } - if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { - return nil, err - } - } - d := &kernfs.Dentry{} - d.Init(i) + i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY) + if err != nil { + return nil, err + } + d.Init(&fs.Filesystem, i) // i.open will take a reference on d. - defer d.DecRef() + defer d.DecRef(ctx) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, // etc.), and use the offset from the host fd otherwise when importing. - return i.open(ctx, d.VFSDentry(), mnt, flags) + return i.open(ctx, d, mnt, flags) } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { return NewFD(ctx, mnt, hostFD, &NewFDOptions{ - IsTTY: isTTY, + Savable: true, + IsTTY: isTTY, }) } // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("host.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. func (filesystemType) Name() string { return "none" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // NewFilesystem sets up and returns a new hostfs filesystem. // // Note that there should only ever be one instance of host.filesystem, @@ -159,15 +250,17 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -177,63 +270,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe return vfs.PrependPathSyntheticError{} } -// inode implements kernfs.Inode. -type inode struct { - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink - - // When the reference count reaches zero, the host fd is closed. - refs.AtomicRefCount - - // hostFD contains the host fd that this file was originally created from, - // which must be available at time of restore. - // - // This field is initialized at creation time and is immutable. - hostFD int - - // ino is an inode number unique within this filesystem. - // - // This field is initialized at creation time and is immutable. - ino uint64 - - // isTTY is true if this file represents a TTY. - // - // This field is initialized at creation time and is immutable. - isTTY bool - - // seekable is false if the host fd points to a file representing a stream, - // e.g. a socket or a pipe. Such files are not seekable and can return - // EWOULDBLOCK for I/O operations. - // - // This field is initialized at creation time and is immutable. - seekable bool - - // wouldBlock is true if the host FD would return EWOULDBLOCK for - // operations that would block. - // - // This field is initialized at creation time and is immutable. - wouldBlock bool - - // Event queue for blocking operations. - queue waiter.Queue - - // canMap specifies whether we allow the file to be memory mapped. - // - // This field is initialized at creation time and is immutable. - canMap bool - - // mapsMu protects mappings. - mapsMu sync.Mutex - - // If canMap is true, mappings tracks mappings of hostFD into - // memmap.MappingSpaces. - mappings memmap.MappingSet - - // pf implements platform.File for mappings of hostFD. - pf inodePlatformFile -} - -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -242,7 +279,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } -// Mode implements kernfs.Inode. +// Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { @@ -253,8 +290,8 @@ func (i *inode) Mode() linux.FileMode { return linux.FileMode(s.Mode) } -// Stat implements kernfs.Inode. -func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL } @@ -366,9 +403,9 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { }, nil } -// SetStat implements kernfs.Inode. +// SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - s := opts.Stat + s := &opts.Stat m := s.Mask if m == 0 { @@ -381,7 +418,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { return err } - if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { return err } @@ -391,6 +428,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } } if m&linux.STATX_SIZE != 0 { + if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { + return syserror.EINVAL + } if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } @@ -422,31 +462,28 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre return nil } -// DecRef implements kernfs.Inode. -func (i *inode) DecRef() { - i.AtomicRefCount.DecRefWithDestructor(i.Destroy) -} - -// Destroy implements kernfs.Inode. -func (i *inode) Destroy() { - if i.wouldBlock { - fdnotifier.RemoveFD(int32(i.hostFD)) - } - if err := unix.Close(i.hostFD); err != nil { - log.Warningf("failed to close host fd %d: %v", i.hostFD, err) - } +// DecRef implements kernfs.Inode.DecRef. +func (i *inode) DecRef(ctx context.Context) { + i.inodeRefs.DecRef(func() { + if i.mayBlock { + fdnotifier.RemoveFD(int32(i.hostFD)) + } + if err := unix.Close(i.hostFD); err != nil { + log.Warningf("failed to close host fd %d: %v", i.hostFD, err) + } + }) } -// Open implements kernfs.Inode. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. if i.Mode().FileType() == linux.S_IFSOCK { return nil, syserror.ENXIO } - return i.open(ctx, vfsd, rp.Mount(), opts.Flags) + return i.open(ctx, d, rp.Mount(), opts.Flags) } -func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err @@ -454,10 +491,12 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fileType := s.Mode & linux.FileTypeMask // Constrain flags to a subset we can handle. - // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. - flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND + // + // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. + flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND - if fileType == syscall.S_IFSOCK { + switch fileType { + case syscall.S_IFSOCK: if i.isTTY { log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) return nil, syserror.ENOTTY @@ -468,35 +507,43 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return nil, err } // Currently, we only allow Unix sockets to be imported. - return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d) - } + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks) - // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that - // we don't allow importing arbitrary file types without proper support. - if i.isTTY { - fd := &TTYFileDescription{ - fileDescription: fileDescription{inode: i}, - termios: linux.DefaultSlaveTermios, + case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR: + if i.isTTY { + fd := &TTYFileDescription{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultReplicaTermios, + } + fd.LockFD.Init(&i.locks) + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil } + + fd := &fileDescription{inode: i} + fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil - } - fd := &fileDescription{inode: i} - vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err + default: + log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) + return nil, syserror.EPERM } - return vfsfd, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but // cached to reduce indirections and casting. fileDescription does not hold @@ -507,31 +554,43 @@ type fileDescription struct { inode *inode // offsetMu protects offset. - offsetMu sync.Mutex + offsetMu sync.Mutex `state:"nosave"` // offset specifies the current file offset. It is only meaningful when // inode.seekable is true. offset int64 } -// SetStat implements vfs.FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } -// Stat implements vfs.FileDescriptionImpl. -func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { - return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) +// Stat implements vfs.FileDescriptionImpl.Stat. +func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } -// Release implements vfs.FileDescriptionImpl. -func (f *fileDescription) Release() { +// Release implements vfs.FileDescriptionImpl.Release. +func (f *fileDescription) Release(context.Context) { // noop } -// PRead implements FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + i := f.inode if !i.seekable { return 0, syserror.ESPIPE @@ -540,23 +599,35 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) } -// Read implements FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + i := f.inode if !i.seekable { + bufN, err := i.readFromBuf(ctx, &dst) + if err != nil { + return bufN, err + } n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) + total := bufN + n if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read // rather than retrying until complete. - if n != 0 { + if total != 0 { err = nil } else { err = syserror.ErrWouldBlock } } - return n, err + return total, err } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + f.offsetMu.Lock() n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) f.offset += n @@ -564,58 +635,90 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts return n, err } -func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. - if flags != 0 { - return 0, syserror.EOPNOTSUPP +func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) { + if atomic.LoadUint32(&i.haveBuf) == 0 { + return 0, nil } + i.bufMu.Lock() + defer i.bufMu.Unlock() + if len(i.buf) == 0 { + return 0, nil + } + n, err := dst.CopyOut(ctx, i.buf) + *dst = dst.DropFirst(n) + i.buf = i.buf[n:] + if len(i.buf) == 0 { + atomic.StoreUint32(&i.haveBuf, 0) + i.buf = nil + } + return int64(n), err +} + +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := dst.CopyOutFrom(ctx, reader) hostfd.PutReadWriterAt(reader) return int64(n), err } -// PWrite implements FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - i := f.inode - if !i.seekable { + if !f.inode.seekable { return 0, syserror.ESPIPE } - return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags) + return f.writeToHostFD(ctx, src, offset, opts.Flags) } -// Write implements FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { - n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) + n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { err = syserror.ErrWouldBlock } return n, err } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + f.offsetMu.Lock() - n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags) + // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if + // another process modifies the host file between retrieving the file size + // and writing to the host fd. This is an unavoidable race condition because + // we cannot enforce synchronization on the host. + if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + f.offsetMu.Unlock() + return 0, err + } + f.offset = s.Size + } + n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) f.offset += n f.offsetMu.Unlock() return n, err } -func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. +func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + hostFD := f.inode.hostFD + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if flags != 0 { return 0, syserror.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) hostfd.PutReadWriterAt(writer) + // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. + if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + if syncErr := unix.Fsync(hostFD); syncErr != nil { + return int64(n), syncErr + } + } return int64(n), err } -// Seek implements FileDescriptionImpl. +// Seek implements vfs.FileDescriptionImpl.Seek. // // Note that we do not support seeking on directories, since we do not even // allow directory fds to be imported at all. @@ -680,15 +783,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return f.offset, nil } -// Sync implements FileDescriptionImpl. -func (f *fileDescription) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. +// Sync implements vfs.FileDescriptionImpl.Sync. +func (f *fileDescription) Sync(ctx context.Context) error { + // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } -// ConfigureMMap implements FileDescriptionImpl. +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { - if !f.inode.canMap { + // NOTE(b/38213152): Technically, some obscure char devices can be memory + // mapped, but we only allow regular files. + if f.inode.ftype != syscall.S_IFREG { return syserror.ENODEV } i := f.inode @@ -699,16 +804,30 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts // EventRegister implements waiter.Waitable.EventRegister. func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { f.inode.queue.EventRegister(e, mask) - fdnotifier.UpdateFD(int32(f.inode.hostFD)) + if f.inode.mayBlock { + fdnotifier.UpdateFD(int32(f.inode.hostFD)) + } } // EventUnregister implements waiter.Waitable.EventUnregister. func (f *fileDescription) EventUnregister(e *waiter.Entry) { f.inode.queue.EventUnregister(e) - fdnotifier.UpdateFD(int32(f.inode.hostFD)) + if f.inode.mayBlock { + fdnotifier.UpdateFD(int32(f.inode.hostFD)) + } } // Readiness uses the poll() syscall to check the status of the underlying FD. func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 8545a82f0..3d7eb2f96 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -19,22 +19,23 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// inodePlatformFile implements platform.File. It exists solely because inode -// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// inodePlatformFile implements memmap.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. +// +// +stateify savable type inodePlatformFile struct { *inode // fdRefsMu protects fdRefs. - fdRefsMu sync.Mutex + fdRefsMu sync.Mutex `state:"nosave"` - // fdRefs counts references on platform.File offsets. It is used solely for + // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. fdRefs fsutil.FrameRefSet @@ -42,35 +43,35 @@ type inodePlatformFile struct { fileMapper fsutil.HostFileMapper // fileMapperInitOnce is used to lazily initialize fileMapper. - fileMapperInitOnce sync.Once + fileMapperInitOnce sync.Once `state:"nosave"` } -// IncRef implements platform.File.IncRef. +// IncRef implements memmap.File.IncRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) IncRef(fr platform.FileRange) { +func (i *inodePlatformFile) IncRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.IncRefAndAccount(fr) i.fdRefsMu.Unlock() } -// DecRef implements platform.File.DecRef. +// DecRef implements memmap.File.DecRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) DecRef(fr platform.FileRange) { +func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.DecRefAndAccount(fr) i.fdRefsMu.Unlock() } -// MapInternal implements platform.File.MapInternal. +// MapInternal implements memmap.File.MapInternal. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (i *inodePlatformFile) FD() int { return i.hostFD } diff --git a/pkg/sentry/fsimpl/host/save_restore.go b/pkg/sentry/fsimpl/host/save_restore.go new file mode 100644 index 000000000..7e32a8863 --- /dev/null +++ b/pkg/sentry/fsimpl/host/save_restore.go @@ -0,0 +1,78 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "io" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/usermem" +) + +// beforeSave is invoked by stateify. +func (i *inode) beforeSave() { + if !i.savable { + panic("host.inode is not savable") + } + if i.ftype == syscall.S_IFIFO { + // If this pipe FD is readable, drain it so that bytes in the pipe can + // be read after restore. (This is a legacy VFS1 feature.) We don't + // know if the pipe FD is readable, so just try reading and tolerate + // EBADF from the read. + i.bufMu.Lock() + defer i.bufMu.Unlock() + var buf [usermem.PageSize]byte + for { + n, err := hostfd.Preadv2(int32(i.hostFD), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), -1 /* offset */, 0 /* flags */) + if n != 0 { + i.buf = append(i.buf, buf[:n]...) + } + if err != nil { + if err == io.EOF || err == syscall.EAGAIN || err == syscall.EBADF { + break + } + panic(fmt.Errorf("host.inode.beforeSave: buffering from pipe failed: %v", err)) + } + } + if len(i.buf) != 0 { + atomic.StoreUint32(&i.haveBuf, 1) + } + } +} + +// afterLoad is invoked by stateify. +func (i *inode) afterLoad() { + if i.mayBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + panic(fmt.Sprintf("host.inode.afterLoad: failed to set host FD %d non-blocking: %v", i.hostFD, err)) + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + panic(fmt.Sprintf("host.inode.afterLoad: fdnotifier.AddFD(%d) failed: %v", i.hostFD, err)) + } + } +} + +// afterLoad is invoked by stateify. +func (i *inodePlatformFile) afterLoad() { + if i.fileMapper.IsInited() { + // Ensure that we don't call i.fileMapper.Init() again. + i.fileMapperInitOnce.Do(func() {}) + } +} diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 38f1fbfba..8a447e29f 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" @@ -47,11 +46,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor return ep, nil } -// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. -// -// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). -const maxSendBufferSize = 8 << 20 - // ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and // transport.Receiver. It is backed by a host fd that was imported at sentry // startup. This fd is shared with a hostfs inode, which retains ownership of @@ -64,8 +58,7 @@ const maxSendBufferSize = 8 << 20 // // +stateify savable type ConnectedEndpoint struct { - // ref keeps track of references to a ConnectedEndpoint. - ref refs.AtomicRefCount + ConnectedEndpointRefs // mu protects fd below. mu sync.RWMutex `state:"nosave"` @@ -114,10 +107,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error { if err != nil { return syserr.FromError(err) } - if sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", sndbuf) - return syserr.ErrInvalidEndpointState - } c.stype = linux.SockType(stype) c.sndbuf = int64(sndbuf) @@ -141,14 +130,14 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable return nil, err } - // AtomicRefCounters start off with a single reference. We need two. - e.ref.IncRef() - e.ref.EnableLeakCheck("host.ConnectedEndpoint") + // ConnectedEndpointRefs start off with a single reference. We need two. + e.IncRef() + e.EnableLeakCheck() return &e, nil } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -225,7 +214,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -326,8 +315,8 @@ func (c *ConnectedEndpoint) destroyLocked() { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(func() { +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.DecRef(func() { c.mu.Lock() c.destroyLocked() c.mu.Unlock() @@ -356,13 +345,13 @@ func (e *SCMConnectedEndpoint) Init() error { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (e *SCMConnectedEndpoint) Release() { - e.ref.DecRefWithDestructor(func() { +func (e *SCMConnectedEndpoint) Release(ctx context.Context) { + e.DecRef(func() { e.mu.Lock() + fdnotifier.RemoveFD(int32(e.fd)) if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) } - fdnotifier.RemoveFD(int32(e.fd)) e.destroyLocked() e.mu.Unlock() }) @@ -387,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s return nil, err } - // AtomicRefCounters start off with a single reference. We need two. - e.ref.IncRef() - e.ref.EnableLeakCheck("host.SCMConnectedEndpoint") + // ConnectedEndpointRefs start off with a single reference. We need two. + e.IncRef() + e.EnableLeakCheck() return &e, nil } diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go index 584c247d2..fc0d5fd38 100644 --- a/pkg/sentry/fsimpl/host/socket_iovec.go +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -17,13 +17,10 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/syserror" ) -// maxIovs is the maximum number of iovecs to pass to the host. -var maxIovs = linux.UIO_MAXIOV - // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { @@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec } } - if iovsRequired > maxIovs { + if iovsRequired > iovec.MaxIovs { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go index 35ded24bc..c0bf45f08 100644 --- a/pkg/sentry/fsimpl/host/socket_unsafe.go +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) ( controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC if n > length { - return length, n, msg.Controllen, controlTrunc, err + return length, n, msg.Controllen, controlTrunc, nil } - return n, n, msg.Controllen, controlTrunc, err + return n, n, msg.Controllen, controlTrunc, nil } // fdWriteVec sends from bufs to fd. diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 68af6e5af..f5c596fec 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -17,7 +17,9 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -28,6 +30,8 @@ import ( // TTYFileDescription implements vfs.FileDescriptionImpl for a host file // descriptor that wraps a TTY FD. +// +// +stateify savable type TTYFileDescription struct { fileDescription @@ -66,15 +70,15 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { } // Release implements fs.FileOperations.Release. -func (t *TTYFileDescription) Release() { +func (t *TTYFileDescription) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileDescription.Release() + t.fileDescription.Release(ctx) } -// PRead implements vfs.FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -92,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, return t.fileDescription.PRead(ctx, dst, offset, opts) } -// Read implements vfs.FileDescriptionImpl. +// Read implements vfs.FileDescriptionImpl.Read. // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. @@ -110,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o return t.fileDescription.Read(ctx, dst, opts) } -// PWrite implements vfs.FileDescriptionImpl. +// PWrite implements vfs.FileDescriptionImpl.PWrite. func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -125,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, return t.fileDescription.PWrite(ctx, src, offset, opts) } -// Write implements vfs.FileDescriptionImpl. +// Write implements vfs.FileDescriptionImpl.Write. func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -140,8 +144,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, return t.fileDescription.Write(ctx, src, opts) } -// Ioctl implements vfs.FileDescriptionImpl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: fd := t.inode.hostFD ioctl := args[1].Uint64() @@ -151,9 +160,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = termios.CopyOut(task, args[2].Pointer()) return 0, err case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: @@ -165,9 +172,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch } var termios linux.Termios - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetTermios(fd, ioctl, &termios) @@ -191,10 +196,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch defer t.mu.Unlock() // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. - pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }) + pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup)) + _, err := pgID.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSPGRP: @@ -202,11 +205,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // Equivalent to tcsetpgrp(fd, *argp). // Set the foreground process group ID of this terminal. - task := kernel.TaskFromContext(ctx) - if task == nil { - return 0, syserror.ENOTTY - } - t.mu.Lock() defer t.mu.Unlock() @@ -225,12 +223,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch return 0, syserror.ENOTTY } - var pgID kernel.ProcessGroupID - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + var pgIDP primitive.Int32 + if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } + pgID := kernel.ProcessGroupID(pgIDP) // pgID must be non-negative. if pgID < 0 { @@ -259,9 +256,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch if err != nil { return 0, err } - _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err = winsize.CopyOut(task, args[2].Pointer()) return 0, err case linux.TIOCSWINSZ: @@ -272,9 +267,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch // set the winsize. var winsize linux.Winsize - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } err := ioctlSetWinsize(fd, &winsize) @@ -325,9 +318,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but - // tty_check_change is more of a blacklist of cases than a - // whitelist, and is surprisingly permissive. Allowing the - // change seems most appropriate. + // tty_check_change only blocks specific cases and is + // surprisingly permissive. Allowing the change seems + // appropriate. return nil } @@ -375,5 +368,15 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return kernel.ERESTARTSYS + return syserror.ERESTARTSYS +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index 2bc757b1a..b2f43a119 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -43,22 +43,6 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} } -// wouldBlock returns true for file types that can return EWOULDBLOCK -// for blocking operations, e.g. pipes, character devices, and sockets. -func wouldBlock(fileType uint32) bool { - return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK -} - -// canMap returns true if a file with fileType is allowed to be memory mapped. -// This is ported over from VFS1, but it's probably not the best way for us -// to check if a file can be memory mapped. -func canMap(fileType uint32) bool { - // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()). - // - // TODO(b/38213152): Some obscure character devices can be mapped. - return fileType == syscall.S_IFREG -} - // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. // If so, they can be transformed into syserror.ErrWouldBlock. func isBlockError(err error) bool { diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index ef34cb28a..aaad67ab8 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -4,6 +4,18 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "kernfs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Dentry", + "Linker": "*Dentry", + }, +) + +go_template_instance( name = "fstree", out = "fstree.go", package = "kernfs", @@ -26,9 +38,54 @@ go_template_instance( }, ) +go_template_instance( + name = "static_directory_refs", + out = "static_directory_refs.go", + package = "kernfs", + prefix = "StaticDirectory", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "StaticDirectory", + }, +) + +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "kernfs_test", + prefix = "dir", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "dir", + }, +) + +go_template_instance( + name = "readonly_dir_refs", + out = "readonly_dir_refs.go", + package = "kernfs_test", + prefix = "readonlyDir", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "readonlyDir", + }, +) + +go_template_instance( + name = "synthetic_directory_refs", + out = "synthetic_directory_refs.go", + package = "kernfs", + prefix = "syntheticDirectory", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "syntheticDirectory", + }, +) + go_library( name = "kernfs", srcs = [ + "dentry_list.go", "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", @@ -36,7 +93,10 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "static_directory_refs.go", "symlink.go", + "synthetic_directory.go", + "synthetic_directory_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -45,7 +105,11 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", @@ -58,17 +122,24 @@ go_library( go_test( name = "kernfs_test", size = "small", - srcs = ["kernfs_test.go"], + srcs = [ + "dir_refs.go", + "kernfs_test.go", + "readonly_dir_refs.go", + ], deps = [ ":kernfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1568a9d49..485504995 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -34,28 +35,30 @@ import ( // +stateify savable type DynamicBytesFile struct { InodeAttrs + InodeNoStatFS InodeNoopRefCount InodeNotDirectory InodeNotSymlink - data vfs.DynamicBytesSource + locks vfs.FileLocks + data vfs.DynamicBytesSource } var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. -func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { +func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) + f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { + if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -77,17 +80,19 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD vfsfd vfs.FileDescription inode Inode } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } - fd.inode = d.Impl().(*Dentry).inode + fd.inode = d.inode fd.SetDataSource(data) return nil } @@ -97,12 +102,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) } -// Read implmenets vfs.FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) } -// PRead implmenets vfs.FileDescriptionImpl.PRead. +// PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) } @@ -118,12 +123,12 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DynamicBytesFD) Release() {} +func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(fs, opts) + return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. @@ -131,3 +136,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 8284e76a7..f8dae22f8 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -15,10 +15,11 @@ package kernfs import ( - "math" + "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -27,9 +28,29 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// SeekEndConfig describes the SEEK_END behaviour for FDs. +// +// +stateify savable +type SeekEndConfig int + +// Constants related to SEEK_END behaviour for FDs. +const ( + // Consider the end of the file to be after the final static entry. This is + // the default option. + SeekEndStaticEntries = iota + // Consider the end of the file to be at offset 0. + SeekEndZero +) + +// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD. +// +// +stateify savable +type GenericDirectoryFDOptions struct { + SeekEnd SeekEndConfig +} + // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory -// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not -// compatible with dynamic directories. +// inode that uses OrderChildren to track child nodes. // // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling // IterDirents callback. The IterDirents callback therefore cannot hash or @@ -39,15 +60,21 @@ import ( // Must be initialize with Init before first use. // // Lock ordering: mu => children.mu. +// +// +stateify savable type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl + vfs.LockFD + + // Immutable. + seekEnd SeekEndConfig vfsfd vfs.FileDescription children *OrderedChildren // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // off is the current directory offset. Protected by "mu". off int64 @@ -55,12 +82,12 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, opts); err != nil { + if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil @@ -69,11 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } + fd.LockFD.Init(locks) + fd.seekEnd = fdOpts.SeekEnd fd.children = children return nil } @@ -109,18 +138,22 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) } -// Release implements vfs.FileDecriptionImpl.Release. -func (fd *GenericDirectoryFD) Release() {} +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *GenericDirectoryFD) Release(context.Context) {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() } +func (fd *GenericDirectoryFD) dentry() *Dentry { + return fd.vfsfd.Dentry().Impl().(*Dentry) +} + func (fd *GenericDirectoryFD) inode() Inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode + return fd.dentry().inode } -// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() @@ -129,7 +162,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat, err := fd.inode().Stat(fd.filesystem(), opts) + stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts) if err != nil { return err } @@ -147,9 +180,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { - vfsd := fd.vfsfd.VirtualDentry().Dentry() - parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode - stat, err := parentInode.Stat(fd.filesystem(), opts) + parentInode := genericParentOrSelf(fd.dentry()).inode + stat, err := parentInode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } @@ -172,13 +204,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { - inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(fd.filesystem(), opts) + stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ - Name: it.Name, + Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, @@ -191,11 +222,11 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent var err error relOffset := fd.off - int64(len(fd.children.set)) - 2 - fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset) + fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset) return err } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() @@ -206,9 +237,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: - // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up - // if they are added after SEEK_END. - offset = math.MaxInt64 + switch fd.seekEnd { + case SeekEndStaticEntries: + fd.children.mu.RLock() + offset += int64(len(fd.children.set)) + offset += 2 // '.' and '..' aren't tracked in children. + fd.children.mu.RUnlock() + case SeekEndZero: + // No-op: offset += 0. + default: + panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) + } default: return 0, syserror.EINVAL } @@ -223,12 +262,26 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() - return inode.Stat(fs, opts) + return inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(ctx, fd.filesystem(), creds, opts) + return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 4a12ae245..399895f3e 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -32,11 +32,12 @@ import ( // // stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) { - d := vfsd.Impl().(*Dentry) +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -53,20 +54,20 @@ afterSymlink: // calls d_revalidate(), but walk_component() => handle_dots() does not. if name == "." { rp.Advance() - return vfsd, nil + return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() - return vfsd, nil + return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil { return nil, err } rp.Advance() - return &d.parent.vfsd, nil + return d.parent, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG @@ -77,18 +78,18 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&next.vfsd); err != nil { + if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { return nil, err } // Resolve any symlink at current path component. - if rp.ShouldFollowSymlink() && next.isSymlink() { + if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() { targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -100,15 +101,18 @@ afterSymlink: goto afterSymlink } rp.Advance() - return &next.vfsd, nil + return next, nil } // revalidateChildLocked must be called after a call to parent.vfsd.Child(name) // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be // nil) to verify that the returned child (or lack thereof) is correct. // -// Preconditions: Filesystem.mu must be locked for at least reading. -// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { @@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(&child.vfsd) - fs.deferDecRef(&child.vfsd) // Reference from Lookup. + if child.inode.Keep() { + // Drop the ref owned by kernfs. + fs.deferDecRef(child) + } + vfsObj.InvalidateDentry(ctx, child.VFSDentry()) child = nil } } if child == nil { - // Dentry isn't cached; it either doesn't exist or failed - // revalidation. Attempt to resolve it via Lookup. - // - // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return - // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes - // that all dentries in the filesystem are (kernfs.)Dentry and performs - // vfs.DentryImpl casts accordingly. - childVFSD, err := parent.inode.Lookup(ctx, name) + // Dentry isn't cached; it either doesn't exist or failed revalidation. + // Attempt to resolve it via Lookup. + childInode, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } - // Reference on childVFSD dropped by a corresponding Valid. - child = childVFSD.Impl().(*Dentry) - parent.insertChildLocked(name, child) + var newChild Dentry + newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. + parent.insertChildLocked(name, &newChild) + child = &newChild + + // Drop the ref on newChild. This will cause the dentry to get pruned + // from the dentry tree by the end of current filesystem operation + // (before returning to the VFS layer) if another ref is not picked on + // this dentry. + if !childInode.Keep() { + fs.deferDecRef(&newChild) + } } return child, nil } @@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Preconditions: Filesystem.mu must be locked for at least reading. // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Done() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if rp.MustBeDir() && !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Final() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). +// directory parent, then returns rp.Component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. parentInode -// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. -func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * isDir(parentInode) == true. +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. - if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { + if _, ok := parent.children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDead() { + if parent.VFSDentry().IsDead() { return "", syserror.ENOENT } return pc, nil @@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parent := vfsd.Impl().(*Dentry).parent +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { + parent := d.parent if parent == nil { return syserror.EBUSY } @@ -234,7 +245,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den } // Release implements vfs.FilesystemImpl.Release. -func (fs *Filesystem) Release() { +func (fs *Filesystem) Release(context.Context) { } // Sync implements vfs.FilesystemImpl.Sync. @@ -246,35 +257,35 @@ func (fs *Filesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } - return inode.CheckPermissions(ctx, creds, ats) + return d.inode.CheckPermissions(ctx, creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { - d := vfsd.Impl().(*Dentry) if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } + vfsd := d.VFSDentry() vfsd.IncRef() // Ownership transferred to caller. return vfsd, nil } @@ -282,14 +293,14 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + d, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } - vfsd.IncRef() // Ownership transferred to caller. - return vfsd, nil + d.IncRef() // Ownership transferred to caller. + return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. @@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) + childI, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewDir(ctx, pc, opts) + childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode) } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - newVFSD, err := parentInode.NewNode(ctx, pc, opts) + newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) + var newD Dentry + newD.Init(fs, newI) + parent.insertChildLocked(pc, &newD) return nil } @@ -397,24 +426,41 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - defer fs.processDeferredDecRefs() - defer fs.mu.RUnlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + defer fs.processDeferredDecRefs(ctx) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { + fs.mu.RUnlock() return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + fs.mu.RUnlock() return nil, err } - return inode.Open(ctx, rp, vfsd, opts) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() + fs.mu.RUnlock() + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + d := rp.Start().Impl().(*Dentry) fs.mu.Lock() - defer fs.mu.Unlock() + unlocked := false + unlock := func() { + if !unlocked { + fs.mu.Unlock() + unlocked = true + } + } + // Process all to-be-decref'd dentries at the end at once. + // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked + // when this is executed. + defer fs.processDeferredDecRefs(ctx) + defer unlock() if rp.Done() { if rp.MustBeDir() { return nil, syserror.EISDIR @@ -422,19 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(ctx, rp, vfsd, opts) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() + unlock() + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } afterTrailingSymlink: - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -449,10 +500,10 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD) + child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -460,13 +511,20 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - childVFSD, err = parentInode.NewFile(ctx, pc, opts) + childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } - child := childVFSD.Impl().(*Dentry) - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.inode.Open(ctx, rp, childVFSD, opts) + var child Dentry + child.Init(fs, childI) + parent.insertChild(pc, &child) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() + unlock() + fd, err := child.inode.Open(ctx, rp, &child, opts) + child.DecRef(ctx) + return fd, err } if err != nil { return nil, err @@ -475,7 +533,6 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - child := childVFSD.Impl().(*Dentry) if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { @@ -483,7 +540,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -499,22 +556,28 @@ afterTrailingSymlink: if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return child.inode.Open(ctx, rp, &child.vfsd, opts) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() + unlock() + fd, err := child.inode.Open(ctx, rp, child, opts) + child.DecRef(ctx) + return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() - d, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } - if !d.Impl().(*Dentry).isSymlink() { + if !d.isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx) + return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -526,16 +589,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. - dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + dstDir, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -553,16 +615,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } - srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? var dst *Dentry - pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDir) switch err { case nil: // Ok, continue with rename as replacement. @@ -573,18 +634,18 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } dst = dstDir.children[pc] if dst == nil { - panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir)) } default: return err } var dstVFSD *vfs.Dentry if dst != nil { - dstVFSD = &dst.vfsd + dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from @@ -596,35 +657,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer dstDir.dirMu.Unlock() } + srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) + err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { - fs.deferDecRef(srcDirVFSD) - dstDir.IncRef() + fs.deferDecRef(srcDir) // child (src) drops ref on old parent. + dstDir.IncRef() // child (src) takes a ref on the new parent. } src.parent = dstDir src.name = pc if dstDir.children == nil { dstDir.children = make(map[string]*Dentry) } + replaced := dstDir.children[pc] dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) + var replaceVFSD *vfs.Dentry + if replaced != nil { + // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. + fs.deferDecRef(replaced) + replaceVFSD = replaced.VFSDentry() + } + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -632,14 +702,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if !d.isDir() { return syserror.ENOTDIR } - if inode.HasChildren() { + if d.inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() @@ -648,56 +717,60 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + + if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) + return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statx{}, err } - return inode.Stat(fs.VFSFilesystem(), opts) + return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1193): actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + return d.inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -706,13 +779,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -720,20 +796,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewSymlink(ctx, pc, target) + childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -741,10 +820,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if d.isDir() { return syserror.EISDIR } @@ -753,39 +831,43 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } @@ -793,12 +875,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } @@ -806,12 +888,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -819,12 +901,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -838,3 +920,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } + +func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // The following is equivalent to vd.DecRef(ctx). This is needed + // because if d belongs to this filesystem, we can not DecRef it right + // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we + // defer the DecRef to when locks are dropped. + vd.Mount().DecRef(ctx) + fs.deferDecRef(d) + } else { + vd.DecRef(ctx) + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 982daa2e6..d9d76758a 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,11 +20,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // InodeNoopRefCount partially implements the Inode interface, specifically the @@ -32,7 +33,10 @@ import ( // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. +// +// +stateify savable type InodeNoopRefCount struct { + InodeTemporary } // IncRef implements Inode.IncRef. @@ -40,7 +44,7 @@ func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. @@ -48,37 +52,35 @@ func (InodeNoopRefCount) TryIncRef() bool { return true } -// Destroy implements Inode.Destroy. -func (InodeNoopRefCount) Destroy() { -} - // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. +// +// +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, syserror.EPERM } @@ -86,7 +88,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. +// +// +stateify savable type InodeNotDirectory struct { + InodeAlwaysValid } // HasChildren implements Inode.HasChildren. @@ -95,89 +100,64 @@ func (InodeNotDirectory) HasChildren() bool { } // NewFile implements Inode.NewFile. -func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } // IterDirents implements Inode.IterDirents. -func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { +func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { panic("IterDirents called on non-directory inode") } -// Valid implements Inode.Valid. -func (InodeNotDirectory) Valid(context.Context) bool { - return true -} - -// InodeNoDynamicLookup partially implements the Inode interface, specifically -// the inodeDynamicLookup sub interface. Directory inodes that do not support -// dymanic entries (i.e. entries that are not "hashed" into the -// vfs.Dentry.children) can embed this to provide no-op implementations for -// functions related to dynamic entries. -type InodeNoDynamicLookup struct{} - -// Lookup implements Inode.Lookup. -func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - return nil, syserror.ENOENT -} - -// IterDirents implements Inode.IterDirents. -func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - return offset, nil -} - -// Valid implements Inode.Valid. -func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { - return true -} - // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. +// +// +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", syserror.EINVAL } @@ -191,18 +171,26 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // inode attributes. // // Must be initialized by Init prior to first use. +// +// +stateify savable type InodeAttrs struct { - devMajor uint32 - devMinor uint32 - ino uint64 - mode uint32 - uid uint32 - gid uint32 - nlink uint32 + devMajor uint32 + devMinor uint32 + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 + blockSize uint32 + + // Timestamps, all nsecs from the Unix epoch. + atime int64 + mtime int64 + ctime int64 } // Init initializes this InodeAttrs. -func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { +func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } @@ -218,6 +206,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, in atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) atomic.StoreUint32(&a.nlink, nlink) + atomic.StoreUint32(&a.blockSize, usermem.PageSize) + now := ktime.NowFromContext(ctx).Nanoseconds() + atomic.StoreInt64(&a.atime, now) + atomic.StoreInt64(&a.mtime, now) + atomic.StoreInt64(&a.ctime, now) } // DevMajor returns the device major number. @@ -240,12 +233,33 @@ func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) } +// TouchAtime updates a.atime to the current time. +func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { + if mnt.Flags.NoATime || mnt.ReadOnly() { + return + } + if err := mnt.CheckBeginWrite(); err != nil { + return + } + atomic.StoreInt64(&a.atime, ktime.NowFromContext(ctx).Nanoseconds()) + mnt.EndWrite() +} + +// TouchCMtime updates a.{c/m}time to the current time. The caller should +// synchronize calls to this so that ctime and mtime are updated to the same +// value. +func (a *InodeAttrs) TouchCMtime(ctx context.Context) { + now := ktime.NowFromContext(ctx).Nanoseconds() + atomic.StoreInt64(&a.mtime, now) + atomic.StoreInt64(&a.ctime, now) +} + // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. -func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { +func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME stat.DevMajor = a.devMajor stat.DevMinor = a.devMinor stat.Ino = atomic.LoadUint64(&a.ino) @@ -253,9 +267,10 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) stat.UID = atomic.LoadUint32(&a.uid) stat.GID = atomic.LoadUint32(&a.gid) stat.Nlink = atomic.LoadUint32(&a.nlink) - - // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. - + stat.Blksize = atomic.LoadUint32(&a.blockSize) + stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.atime)) + stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.mtime)) + stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.ctime)) return stat, nil } @@ -264,10 +279,18 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut if opts.Stat.Mask == 0 { return nil } - if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. Setting the size is often + // allowed by kernfs files but does not do anything. If some other behavior is + // needed, the embedder should consider extending SetStat. + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { + return syserror.EISDIR + } + if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -289,10 +312,19 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut atomic.StoreUint32(&a.gid, stat.GID) } - // Note that not all fields are modifiable. For example, the file type and - // inode numbers are immutable after node creation. - - // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + now := ktime.NowFromContext(ctx).Nanoseconds() + if stat.Mask&linux.STATX_ATIME != 0 { + if stat.Atime.Nsec == linux.UTIME_NOW { + stat.Atime = linux.NsecToStatxTimestamp(now) + } + atomic.StoreInt64(&a.atime, stat.Atime.ToNsec()) + } + if stat.Mask&linux.STATX_MTIME != 0 { + if stat.Mtime.Nsec == linux.UTIME_NOW { + stat.Mtime = linux.NsecToStatxTimestamp(now) + } + atomic.StoreInt64(&a.mtime, stat.Mtime.ToNsec()) + } return nil } @@ -323,13 +355,17 @@ func (a *InodeAttrs) DecLinks() { } } +// +stateify savable type slot struct { - Name string - Dentry *vfs.Dentry + name string + inode Inode + static bool slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. +// +// +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to @@ -339,20 +375,28 @@ type OrderedChildrenOptions struct { } // OrderedChildren partially implements the Inode interface. OrderedChildren can -// be embedded in directory inodes to keep track of the children in the +// be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see -// GenericDirectoryFD. OrderedChildren is not compatible with dynamic -// directories. +// GenericDirectoryFD. +// +// OrderedChildren can represent a node in an Inode tree. The children inodes +// might be directories themselves using OrderedChildren; hence extending the +// tree. The parent inode (OrderedChildren user) holds a ref on all its static +// children. This lets the static inodes outlive their associated dentry. +// While the dentry might have to be regenerated via a Lookup() call, we can +// keep reusing the same static inode. These static children inodes are finally +// DecRef'd when this directory inode is being destroyed. This makes +// OrderedChildren suitable for static directory entries as well. // // Must be initialize with Init before first use. +// +// +stateify savable type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } @@ -363,39 +407,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef() { - o.AtomicRefCount.DecRefWithDestructor(o.Destroy) -} - -// Destroy cleans up resources referenced by this OrderedChildren. -func (o *OrderedChildren) Destroy() { +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy(ctx context.Context) { o.mu.Lock() defer o.mu.Unlock() + // Drop the ref that o owns on the static inodes it holds. + for _, s := range o.set { + if s.static { + s.inode.DecRef(ctx) + } + } o.order.Reset() o.set = nil } -// Populate inserts children into this OrderedChildren, and d's dentry -// cache. Populate returns the number of directories inserted, which the caller +// Populate inserts static children into this OrderedChildren. +// Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d must represent a directory inode. children must not contain -// any conflicting entries already in o. -func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { +// Precondition: +// * d must represent a directory inode. +// * children must not contain any conflicting entries already in o. +// * Caller must hold a reference on all inodes passed. +// +// Postcondition: Caller's references on inodes are transferred to o. +func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { - if child.isDir() { + if child.Mode().IsDir() { links++ } - if err := o.Insert(name, child.VFSDentry()); err != nil { - panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + if err := o.insert(name, child, true); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } - d.InsertChild(name, child) } return links } +// Lookup implements Inode.Lookup. +func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { + o.mu.RLock() + defer o.mu.RUnlock() + + s, ok := o.set[name] + if !ok { + return nil, syserror.ENOENT + } + + s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. + return s.inode, nil +} + +// IterDirents implements Inode.IterDirents. +func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + // All entries from OrderedChildren have already been handled in + // GenericDirectoryFD.IterDirents. + return offset, nil +} + // HasChildren implements Inode.HasChildren. func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() @@ -403,17 +474,27 @@ func (o *OrderedChildren) HasChildren() bool { return len(o.set) > 0 } -// Insert inserts child into o. This ignores the writability of o, as this is -// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. -func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { +// Insert inserts a dynamic child into o. This ignores the writability of o, as +// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child Inode) error { + return o.insert(name, child, false) +} + +// insert inserts child into o. +// +// Precondition: Caller must be holding a ref on child if static is true. +// +// Postcondition: Caller's ref on child is transferred to o if static is true. +func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return syserror.EEXIST } s := &slot{ - Name: name, - Dentry: child, + name: name, + inode: child, + static: static, } o.order.PushBack(s) o.set[name] = s @@ -423,44 +504,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) + } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for writing. -func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { +func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("replacing a static inode: %v", s.inode)) + } + // Existing slot with given name, simply replace the dentry. - var old *vfs.Dentry - old, s.Dentry = s.Dentry, new - return old + s.inode = newI } // No existing slot with given name, create and hash new slot. s := &slot{ - Name: name, - Dentry: new, + name: name, + inode: newI, + static: false, } o.order.PushBack(s) o.set[name] = s - return nil } // Precondition: caller must hold o.mu for reading or writing. -func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { +func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return syserror.ENOENT } - if s.Dentry != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + if s.inode != child { + panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) } return nil } // Unlink implements Inode.Unlink. -func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return syserror.EPERM } @@ -469,17 +555,20 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De if err := o.checkExistingLocked(name, child); err != nil { return err } + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. o.removeLocked(name) return nil } -// Rmdir implements Inode.Rmdir. -func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { +// RmDir implements Inode.RmDir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } +// +stateify savable type renameAcrossDifferentImplementationsError struct{} func (renameAcrossDifferentImplementationsError) Error() string { @@ -495,13 +584,13 @@ func (renameAcrossDifferentImplementationsError) Error() string { // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. -func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { - dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { + dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return nil, renameAcrossDifferentImplementationsError{} + return renameAcrossDifferentImplementationsError{} } if !o.writable || !dst.writable { - return nil, syserror.EPERM + return syserror.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any @@ -514,10 +603,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c defer dst.mu.Unlock() } if err := o.checkExistingLocked(oldname, child); err != nil { - return nil, err + return err } - replaced := dst.replaceChildLocked(newname, child) - return replaced, nil + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. + dst.replaceChildLocked(ctx, newname, child) + return nil } // nthLocked returns an iterator to the nth child tracked by this object. The @@ -536,12 +627,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } // InodeSymlink partially implements Inode interface for symlinks. +// +// +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. -func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -550,41 +643,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { - InodeNotSymlink - InodeDirectoryNoNewChildren + InodeAlwaysValid InodeAttrs - InodeNoDynamicLookup + InodeDirectoryNoNewChildren + InodeNoStatFS + InodeNotSymlink + InodeTemporary OrderedChildren + StaticDirectoryRefs + + locks vfs.FileLocks + fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} - inode.Init(creds, devMajor, devMinor, ino, perm) - - dentry := &Dentry{} - dentry.Init(inode) + inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts) + inode.EnableLeakCheck() inode.OrderedChildren.Init(OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, children) + links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) - return dentry + return inode } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) + s.fdOpts = fdOpts + s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode. -func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) +// Open implements Inode.Open. +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } @@ -596,10 +694,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti return syserror.EPERM } -// AlwaysValid partially implements kernfs.inodeDynamicLookup. -type AlwaysValid struct{} +// DecRef implements Inode.DecRef. +func (s *StaticDirectory) DecRef(ctx context.Context) { + s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) +} + +// InodeAlwaysValid partially implements Inode. +// +// +stateify savable +type InodeAlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup. -func (*AlwaysValid) Valid(context.Context) bool { +// Valid implements Inode.Valid. +func (*InodeAlwaysValid) Valid(context.Context) bool { return true } + +// InodeTemporary partially implements Inode. +// +// +stateify savable +type InodeTemporary struct{} + +// Keep implements Inode.Keep. +func (*InodeTemporary) Keep() bool { + return false +} + +// InodeNoStatFS partially implements the Inode interface, where the client +// filesystem doesn't support statfs(2). +// +// +stateify savable +type InodeNoStatFS struct{} + +// StatFS implements Inode.StatFS. +func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index bbee8ccda..5c5e09ac5 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -29,12 +29,16 @@ // // Reference Model: // -// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// Kernfs dentries represents named pointers to inodes. Kernfs is solely +// reponsible for maintaining and modifying its dentry tree; inode +// implementations can not access the tree. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a -// reference on the inode it points to. Multiple dentries can point to the same -// inode (for example, in the case of hardlinks). File descriptors hold a -// reference to the dentry they're opened on. +// reference on the inode it points to (although that might not be the only +// reference on the inode). Due to this inodes can outlive the dentries that +// point to them. Multiple dentries can point to the same inode (for example, +// in the case of hardlinks). File descriptors hold a reference to the dentry +// they're opened on. // // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries require holding Filesystem.mu for writing. To @@ -47,8 +51,8 @@ // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu -// kernfs.Filesystem.droppedDentriesMu // (inode implementation locks, if any) +// kernfs.Filesystem.droppedDentriesMu package kernfs import ( @@ -57,7 +61,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -66,15 +69,17 @@ import ( // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. +// +// +stateify savable type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex + droppedDentriesMu sync.Mutex `state:"nosave"` // droppedDentries is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by droppedDentriesMu. - droppedDentries []*vfs.Dentry + droppedDentries []*Dentry // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -93,22 +98,32 @@ type Filesystem struct { // example: // // fs.mu.RLock() - // fs.mu.processDeferredDecRefs() + // defer fs.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // nextInoMinusOne is used to to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. nextInoMinusOne uint64 + + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by mu. + cachedDentries dentryList + cachedDentriesLen uint64 + + // MaxCachedDentries is the maximum size of cachedDentries. If not set, + // defaults to 0 and kernfs does not cache any dentries. This is immutable. + MaxCachedDentries uint64 } // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. -// -// Precondition: d must not already be pending destruction. -func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { +// This may be called while Filesystem.mu or Dentry.dirMu is locked. +func (fs *Filesystem) deferDecRef(d *Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) fs.droppedDentriesMu.Unlock() @@ -116,17 +131,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. -func (fs *Filesystem) processDeferredDecRefs() { - fs.mu.Lock() - fs.processDeferredDecRefsLocked() - fs.mu.Unlock() -} - -// Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked() { +// +// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. +func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef() + // Defer the DecRef call so that we are not holding droppedDentriesMu + // when DecRef is called. + defer d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -155,15 +167,24 @@ const ( // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of -// a mounted filesystem tree. Kernfs doesn't cache dentries once all references -// to them are removed. Dentries hold a single reference to the inode they point +// a mounted filesystem tree. Kernfs drops dentries once all references to them +// are dropped. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. +// +// +stateify savable type Dentry struct { vfsd vfs.Dentry - refs.AtomicRefCount + // refs is the reference count. When refs reaches 0, the dentry may be + // added to the cache or destroyed. If refs == -1, the dentry has already + // been destroyed. refs are allowed to go to 0 and increase again. refs is + // accessed using atomic memory operations. + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -172,21 +193,177 @@ type Dentry struct { parent *Dentry name string + // If cached is true, dentryEntry links dentry into + // Filesystem.cachedDentries. cached and dentryEntry are protected by + // Filesystem.mu. + cached bool + dentryEntry + // dirMu protects children and the names of child Dentries. - dirMu sync.Mutex + // + // Note that holding fs.mu for writing is not sufficient; + // revalidateChildLocked(), which is a very hot path, may modify children with + // fs.mu acquired for reading only. + dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode } +// IncRef implements vfs.DentryImpl.IncRef. +func (d *Dentry) IncRef() { + // d.refs may be 0 if d.fs.mu is locked, which serializes against + // d.cacheLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *Dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef(ctx context.Context) { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.mu.Lock() + d.cacheLocked(ctx) + d.fs.mu.Unlock() + } else if refs < 0 { + panic("kernfs.Dentry.DecRef() called without holding a reference") + } +} + +// cacheLocked should be called after d's reference count becomes 0. The ref +// count check may happen before acquiring d.fs.mu so there might be a race +// condition where the ref count is increased again by the time the caller +// acquires d.fs.mu. This race is handled. +// Only reachable dentries are added to the cache. However, a dentry might +// become unreachable *while* it is in the cache due to invalidation. +// +// Preconditions: d.fs.mu must be locked for writing. +func (d *Dentry) cacheLocked(ctx context.Context) { + // Dentries with a non-zero reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires d.fs.mu, so if d.refs is zero then it will + // remain zero while we hold d.fs.mu for writing.) + refs := atomic.LoadInt64(&d.refs) + if refs == -1 { + // Dentry has already been destroyed. + panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d)) + } + if refs > 0 { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + return + } + // If the dentry is deleted and invalidated or has no parent, then it is no + // longer reachable by path resolution and should be dropped immediately + // because it has zero references. + // Note that a dentry may not always have a parent; for example magic links + // as described in Inode.Getlink. + if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil { + if !isDead { + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) + } + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + d.destroyLocked(ctx) + return + } + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries { + return + } + // Evict the least recently used dentry because cache size is greater than + // max cache size (configured on mount). + victim := d.fs.cachedDentries.Back() + d.fs.cachedDentries.Remove(victim) + d.fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path resolution + // after it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) == 0 { + if !victim.vfsd.IsDead() { + victim.parent.dirMu.Lock() + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry()) + delete(victim.parent.children, victim.name) + victim.parent.dirMu.Unlock() + } + victim.destroyLocked(ctx) + } + // Whether or not victim was destroyed, we brought fs.cachedDentriesLen + // back down to fs.MaxCachedDentries, so we don't loop. +} + +// destroyLocked destroys the dentry. +// +// Preconditions: +// * d.fs.mu must be locked for writing. +// * d.refs == 0. +// * d should have been removed from d.parent.children, i.e. d is not reachable +// by path traversal. +// * d.vfsd.IsDead() is true. +func (d *Dentry) destroyLocked(ctx context.Context) { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("dentry.destroyLocked() called on already destroyed dentry") + default: + panic("dentry.destroyLocked() called with references on the dentry") + } + + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + + // Drop the reference held by d on its parent without recursively locking + // d.fs.mu. + if d.parent != nil { + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.cacheLocked(ctx) + } else if refs < 0 { + panic("kernfs.Dentry.DecRef() called without holding a reference") + } + } +} + // Init initializes this dentry. // // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. -func (d *Dentry) Init(inode Inode) { +func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.vfsd.Init(d) + d.fs = fs d.inode = inode + atomic.StoreInt64(&d.refs, 1) ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { d.flags |= dflagsIsDir @@ -211,51 +388,44 @@ func (d *Dentry) isSymlink() bool { return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0 } -// DecRef implements vfs.DentryImpl.DecRef. -func (d *Dentry) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy() { - d.inode.DecRef() // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef() // IncRef from Dentry.InsertChild. - } -} - // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *Dentry) Watches() *vfs.Watches { return nil } -// InsertChild inserts child into the vfs dentry cache with the given name under -// this dentry. This does not update the directory inode, so calling this on -// its own isn't sufficient to insert a child into a directory. InsertChild -// updates the link count on d if required. +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +func (d *Dentry) OnZeroWatches(context.Context) {} + +// insertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on its +// own isn't sufficient to insert a child into a directory. // -// Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *Dentry) { +// Preconditions: +// * d must represent a directory inode. +// * d.fs.mu must be locked for at least reading. +func (d *Dentry) insertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() } -// insertChildLocked is equivalent to InsertChild, with additional +// insertChildLocked is equivalent to insertChild, with additional // preconditions. // -// Precondition: d.dirMu must be locked. +// Preconditions: +// * d must represent a directory inode. +// * d.dirMu must be locked. +// * d.fs.mu must be locked for at least reading. func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { - panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent = d @@ -286,7 +456,6 @@ func (d *Dentry) Inode() Inode { // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. -// - Updating link and reference counts. // // Specific responsibilities of implementations are documented below. type Inode interface { @@ -296,7 +465,8 @@ type Inode interface { inodeRefs // Methods related to node metadata. A generic implementation is provided by - // InodeAttrs. + // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for + // managing link counts. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a @@ -307,28 +477,32 @@ type Inode interface { // a blanket implementation for all non-directory inodes. inodeDirectory - // Method for inodes that represent dynamic directories and their - // children. InodeNoDynamicLookup provides a blanket implementation for all - // non-dynamic-directory inodes. - inodeDynamicLookup - // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the - // inode for its lifetime. + // dentry for its lifetime. // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + + // StatFS returns filesystem statistics for the client filesystem. This + // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem + // doesn't support statfs(2), this should return ENOSYS. + StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) + + // Keep indicates whether the dentry created after Inode.Lookup should be + // kept in the kernfs dentry tree. + Keep() bool + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool } type inodeRefs interface { IncRef() - DecRef() + DecRef(ctx context.Context) TryIncRef() bool - // Destroy is called when the inode reaches zero references. Destroy release - // all resources (references) on objects referenced by the inode, including - // any child dentries. - Destroy() } type inodeMetadata interface { @@ -343,7 +517,7 @@ type inodeMetadata interface { // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. - Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) + Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking @@ -355,8 +529,8 @@ type inodeMetadata interface { // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { - // The New{File,Dir,Node,Symlink} methods below should return a new inode - // hashed into this inode. + // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode + // that will be hashed into the dentry tree. // // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different @@ -367,75 +541,69 @@ type inodeDirectory interface { HasChildren() bool // NewFile creates a new regular file inode. - NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) // NewDir creates a new directory inode. - NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. - NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + NewLink(ctx context.Context, name string, target Inode) (Inode, error) // NewSymlink creates a new symbolic link inode. - NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + NewSymlink(ctx context.Context, name, target string) (Inode, error) // NewNode creates a new filesystem node for a mknod syscall. - NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) // Unlink removes a child dentry from this directory inode. - Unlink(ctx context.Context, name string, child *vfs.Dentry) error + Unlink(ctx context.Context, name string, child Inode) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. - RmDir(ctx context.Context, name string, child *vfs.Dentry) error + RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source - // directory. If Rename replaces a dentry in the destination directory, it - // should return the replaced dentry or nil otherwise. + // directory. // // Precondition: Caller must serialize concurrent calls to Rename. - Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) -} + Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error -type inodeDynamicLookup interface { - // Lookup should return an appropriate dentry if name should resolve to a - // child of this dynamic directory inode. This gives the directory an - // opportunity on every lookup to resolve additional entries that aren't - // hashed into the directory. This is only called when the inode is a - // directory. If the inode is not a directory, or if the directory only - // contains a static set of children, the implementer can unconditionally - // return an appropriate error (ENOTDIR and ENOENT respectively). + // Lookup should return an appropriate inode if name should resolve to a + // child of this directory inode. This gives the directory an opportunity + // on every lookup to resolve additional entries. This is only called when + // the inode is a directory. // - // The child returned by Lookup will be hashed into the VFS dentry tree. Its - // lifetime can be controlled by the filesystem implementation with an - // appropriate implementation of Valid. + // The child returned by Lookup will be hashed into the VFS dentry tree, + // at least for the duration of the current FS operation. // - // Lookup returns the child with an extra reference and the caller owns this - // reference. - Lookup(ctx context.Context, name string) (*vfs.Dentry, error) - - // Valid should return true if this inode is still valid, or needs to - // be resolved again by a call to Lookup. - Valid(ctx context.Context) bool + // Lookup must return the child with an extra reference whose ownership is + // transferred to the dentry that is created to point to that inode. If + // Inode.Keep returns false, that new dentry will be dropped at the end of + // the current filesystem operation (before returning back to the VFS + // layer) if no other ref is picked on that dentry. If Inode.Keep returns + // true, then the dentry will be cached into the dentry tree until it is + // Unlink'd or RmDir'd. + Lookup(ctx context.Context, name string) (Inode, error) // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the FileDescription. + // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include - // results from the caller. 'relOffset' is the offset inside the entries - // returned by this IterDirents invocation. In other words, - // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, - // while 'relOffset' is the place where iteration should start from. - IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) + // results from the caller (e.g. "." and ".."). 'relOffset' is the offset + // inside the entries returned by this IterDirents invocation. In other words, + // 'offset' should be used to calculate each vfs.Dirent.NextOff as well as + // the return value, while 'relOffset' is the place to start iteration. + IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context) (string, error) + Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 412cf6ac9..2418eec44 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file." // RootDentryFn is a generator function for creating the root dentry of a test // filesystem. See newTestSystem. -type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry +type RootDentryFn func(context.Context, *auth.Credentials, *filesystem) kernfs.Inode // newTestSystem sets up a minimal environment for running a test, including an // instance of a test filesystem. Tests can control the contents of the @@ -46,13 +46,13 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := &vfs.VirtualFilesystem{} - if err := v.Init(); err != nil { + if err := v.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } @@ -72,14 +72,11 @@ type file struct { content string } -func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { +func (fs *filesystem) newFile(ctx context.Context, creds *auth.Credentials, content string) kernfs.Inode { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) - - d := &kernfs.Dentry{} - d.Init(f) - return d + f.DynamicBytesFile.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) + return f } func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { @@ -96,96 +93,112 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S } type readonlyDir struct { + readonlyDirRefs attrs - kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup + kernfs.InodeAlwaysValid kernfs.InodeDirectoryNoNewChildren - + kernfs.InodeNoStatFS + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - dentry kernfs.Dentry + + locks vfs.FileLocks } -func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newReadonlyDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &readonlyDir{} - dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - dir.dentry.Init(dir) - - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.EnableLeakCheck() + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +func (d *readonlyDir) DecRef(ctx context.Context) { + d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) }) +} + type dir struct { + dirRefs attrs + kernfs.InodeAlwaysValid kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup - - fs *filesystem - dentry kernfs.Dentry + kernfs.InodeNoStatFS + kernfs.InodeTemporary kernfs.OrderedChildren + + locks vfs.FileLocks + + fs *filesystem } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &dir{} dir.fs = fs - dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) - dir.dentry.Init(dir) - - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + dir.EnableLeakCheck() - return &dir.dentry + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) - dir := d.fs.newDir(creds, opts.Mode, nil) - dirVFSD := dir.VFSDentry() - if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { - dir.DecRef() + dir := d.fs.newDir(ctx, creds, opts.Mode, nil) + if err := d.OrderedChildren.Insert(name, dir); err != nil { + dir.DecRef(ctx) return nil, err } + d.TouchCMtime(ctx) d.IncLinks(1) - return dirVFSD, nil + return dir, nil } -func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) - f := d.fs.newFile(creds, "") - fVFSD := f.VFSDentry() - if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { - f.DecRef() + f := d.fs.newFile(ctx, creds, "") + if err := d.OrderedChildren.Insert(name, f); err != nil { + f.DecRef(ctx) return nil, err } - return fVFSD, nil + d.TouchCMtime(ctx) + return f, nil } -func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { +func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { return nil, syserror.EPERM } @@ -193,29 +206,33 @@ func (fsType) Name() string { return "kernfs" } +func (fsType) Release(ctx context.Context) {} + func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} fs.VFSFilesystem().Init(vfsObj, &fst, fs) - root := fst.rootFn(creds, fs) - return fs.VFSFilesystem(), root.VFSDentry(), nil + root := fst.rootFn(ctx, creds, fs) + var d kernfs.Dentry + d.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), d.VFSDentry(), nil } // -------------------- Remainder of the file are test cases -------------------- func TestBasic(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ - "file1": fs.newFile(creds, staticFileContent), + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "file1": fs.newFile(ctx, creds, staticFileContent), }) }) defer sys.Destroy() - sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx) } func TestMkdirGetDentry(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ - "dir1": fs.newDir(creds, 0755, nil), + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir1": fs.newDir(ctx, creds, 0755, nil), }) }) defer sys.Destroy() @@ -224,13 +241,13 @@ func TestMkdirGetDentry(t *testing.T) { if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } - sys.GetDentryOrDie(pop).DecRef() + sys.GetDentryOrDie(pop).DecRef(sys.Ctx) } func TestReadStaticFile(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ - "file1": fs.newFile(creds, staticFileContent), + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "file1": fs.newFile(ctx, creds, staticFileContent), }) }) defer sys.Destroy() @@ -242,7 +259,7 @@ func TestReadStaticFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) content, err := sys.ReadToEnd(fd) if err != nil { @@ -254,9 +271,9 @@ func TestReadStaticFile(t *testing.T) { } func TestCreateNewFileInStaticDir(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ - "dir1": fs.newDir(creds, 0755, nil), + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir1": fs.newDir(ctx, creds, 0755, nil), }) }) defer sys.Destroy() @@ -269,7 +286,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } // Close the file. The file should persist. - fd.DecRef() + fd.DecRef(sys.Ctx) fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, @@ -277,12 +294,12 @@ func TestCreateNewFileInStaticDir(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - fd.DecRef() + fd.DecRef(sys.Ctx) } func TestDirFDReadWrite(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, nil) + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, nil) }) defer sys.Destroy() @@ -293,7 +310,7 @@ func TestDirFDReadWrite(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) // Read/Write should fail for directory FDs. if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { @@ -305,14 +322,14 @@ func TestDirFDReadWrite(t *testing.T) { } func TestDirFDIterDirents(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{ // Fill root with nodes backed by various inode implementations. - "dir1": fs.newReadonlyDir(creds, 0755, nil), - "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ - "dir3": fs.newDir(creds, 0755, nil), + "dir1": fs.newReadonlyDir(ctx, creds, 0755, nil), + "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir3": fs.newDir(ctx, creds, 0755, nil), }), - "file1": fs.newFile(creds, staticFileContent), + "file1": fs.newFile(ctx, creds, staticFileContent), }) }) defer sys.Destroy() diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 2ab3f53fd..a0736c0d6 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -24,10 +24,13 @@ import ( // StaticSymlink provides an Inode implementation for symlinks that point to // a immutable target. +// +// +stateify savable type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink + InodeNoStatFS target string } @@ -35,23 +38,20 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { +func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode { inode := &StaticSymlink{} - inode.Init(creds, devMajor, devMinor, ino, target) - - d := &Dentry{} - d.Init(inode) - return d + inode.Init(ctx, creds, devMajor, devMinor, ino, target) + return inode } // Init initializes the instance. -func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { +func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { s.target = target - s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) + s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } -// Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { +// Readlink implements Inode.Readlink. +func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go new file mode 100644 index 000000000..463d77d79 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -0,0 +1,113 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// syntheticDirectory implements kernfs.Inode for a directory created by +// MkdirAt(ForSyntheticMountpoint=true). +// +// +stateify savable +type syntheticDirectory struct { + InodeAlwaysValid + InodeAttrs + InodeNoStatFS + InodeNotSymlink + OrderedChildren + syntheticDirectoryRefs + + locks vfs.FileLocks +} + +var _ Inode = (*syntheticDirectory)(nil) + +func newSyntheticDirectory(ctx context.Context, creds *auth.Credentials, perm linux.FileMode) Inode { + inode := &syntheticDirectory{} + inode.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm) + return inode +} + +func (dir *syntheticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm)) + } + dir.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.S_IFDIR|perm) + dir.OrderedChildren.Init(OrderedChildrenOptions{ + Writable: true, + }) +} + +// Open implements Inode.Open. +func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{}) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// NewFile implements Inode.NewFile. +func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { + if !opts.ForSyntheticMountpoint { + return nil, syserror.EPERM + } + subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) + if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { + subdirI.DecRef(ctx) + return nil, err + } + dir.TouchCMtime(ctx) + return subdirI, nil +} + +// NewLink implements Inode.NewLink. +func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// DecRef implements Inode.DecRef. +func (dir *syntheticDirectory) DecRef(ctx context.Context) { + dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) }) +} + +// Keep implements Inode.Keep. This is redundant because inodes will never be +// created via Lookup and inodes are always valid. Makes sense to return true +// because these inodes are not temporary and should only be removed on RmDir. +func (dir *syntheticDirectory) Keep() bool { + return true +} diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD new file mode 100644 index 000000000..fd6c55921 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -0,0 +1,46 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "overlay", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "overlay", + srcs = [ + "copy_up.go", + "directory.go", + "filesystem.go", + "fstree.go", + "overlay.go", + "regular_file.go", + "save_restore.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/log", + "//pkg/refsvfs2", + "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go new file mode 100644 index 000000000..4506642ca --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -0,0 +1,429 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "fmt" + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isCopiedUp() bool { + return atomic.LoadUint32(&d.copiedUp) != 0 +} + +// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok(). +// +// Preconditions: filesystem.renameMu must be locked. +func (d *dentry) copyUpLocked(ctx context.Context) error { + // Fast path. + if d.isCopiedUp() { + return nil + } + + // Attach our credentials to the context, as some VFS operations use + // credentials from context rather an take an explicit creds parameter. + ctx = auth.ContextWithCredentials(ctx, d.fs.creds) + + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + switch ftype { + case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR: + // Can be copied-up. + default: + // Can't be copied-up. + return syserror.EPERM + } + + // Ensure that our parent directory is copied-up. + if d.parent == nil { + // d is a filesystem root with no upper layer. + return syserror.EROFS + } + if err := d.parent.copyUpLocked(ctx); err != nil { + return err + } + + d.copyMu.Lock() + defer d.copyMu.Unlock() + if d.upperVD.Ok() { + // Raced with another call to d.copyUpLocked(). + return nil + } + if d.vfsd.IsDead() { + // Raced with deletion of d. + return syserror.ENOENT + } + + // Obtain settable timestamps from the lower layer. + vfsObj := d.fs.vfsfs.VirtualFilesystem() + oldpop := vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + } + const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME + oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{ + Mask: timestampsMask, + }) + if err != nil { + return err + } + + // Perform copy-up. + newpop := vfs.PathOperation{ + Root: d.parent.upperVD, + Start: d.parent.upperVD, + Path: fspath.Parse(d.name), + } + // Used during copy-up of memory-mapped regular files. + var mmapOpts *memmap.MMapOpts + cleanupUndoCopyUp := func() { + var err error + if ftype == linux.S_IFDIR { + err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop) + } else { + err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) + } + if err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)) + } + if d.upperVD.Ok() { + d.upperVD.DecRef(ctx) + d.upperVD = vfs.VirtualDentry{} + } + } + switch ftype { + case linux.S_IFREG: + oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return err + } + defer oldFD.DecRef(ctx) + newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }) + if err != nil { + return err + } + defer newFD.DecRef(ctx) + bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size + for { + readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) + if readErr != nil && readErr != io.EOF { + cleanupUndoCopyUp() + return readErr + } + total := int64(0) + for total < readN { + writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{}) + total += writeN + if writeErr != nil { + cleanupUndoCopyUp() + return writeErr + } + } + if readErr == io.EOF { + break + } + } + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if d.wrappedMappable != nil { + // We may have memory mappings of the file on the lower layer. + // Switch to mapping the file on the upper layer instead. + mmapOpts = &memmap.MMapOpts{ + Perms: usermem.ReadWrite, + MaxPerms: usermem.ReadWrite, + } + if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil { + cleanupUndoCopyUp() + return err + } + if mmapOpts.MappingIdentity != nil { + mmapOpts.MappingIdentity.DecRef(ctx) + } + // Don't actually switch Mappables until the end of copy-up; see + // below for why. + } + if err := newFD.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = newFD.VirtualDentry() + d.upperVD.IncRef() + + case linux.S_IFDIR: + if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFLNK: + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop) + if err != nil { + return err + } + if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + Mode: uint16(d.mode), + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFBLK, linux.S_IFCHR: + if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ + Mode: linux.FileMode(d.mode), + DevMajor: oldStat.RdevMajor, + DevMinor: oldStat.RdevMinor, + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + default: + // Should have rejected this at the beginning of this function? + panic(fmt.Sprintf("unexpected file type %o", ftype)) + } + + if err := d.copyXattrsLocked(ctx); err != nil { + cleanupUndoCopyUp() + return err + } + + // Update the dentry's device and inode numbers (except for directories, + // for which these remain overlay-assigned). + if ftype != linux.S_IFDIR { + upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + }, &vfs.StatOptions{ + Mask: linux.STATX_INO, + }) + if err != nil { + cleanupUndoCopyUp() + return err + } + if upperStat.Mask&linux.STATX_INO == 0 { + cleanupUndoCopyUp() + return syserror.EREMOTE + } + atomic.StoreUint32(&d.devMajor, upperStat.DevMajor) + atomic.StoreUint32(&d.devMinor, upperStat.DevMinor) + atomic.StoreUint64(&d.ino, upperStat.Ino) + } + + if mmapOpts != nil && mmapOpts.Mappable != nil { + // Note that if mmapOpts != nil, then d.mapsMu is locked for writing + // (from the S_IFREG path above). + + // Propagate mappings of d to the new Mappable. Remember which mappings + // we added so we can remove them on failure. + upperMappable := mmapOpts.Mappable + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + + // Switch to the new Mappable. We do this at the end of copy-up + // because: + // + // - We need to switch Mappables (by changing d.wrappedMappable) before + // invalidating Translations from the old Mappable (to pick up + // Translations from the new one). + // + // - We need to lock d.dataMu while changing d.wrappedMappable, but + // must invalidate Translations with d.dataMu unlocked (due to lock + // ordering). + // + // - Consequently, once we unlock d.dataMu, other threads may + // immediately observe the new (copied-up) Mappable, which we want to + // delay until copy-up is guaranteed to succeed. + d.dataMu.Lock() + lowerMappable := d.wrappedMappable + d.wrappedMappable = upperMappable + d.dataMu.Unlock() + d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove mappings from the old Mappable. + for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable) + } + } + d.lowerMappings.RemoveAll() + } + + atomic.StoreUint32(&d.copiedUp, 1) + return nil +} + +// copyXattrsLocked copies a subset of lower's extended attributes to upper. +// Attributes that configure an overlay in the lower are not copied up. +// +// Preconditions: d.copyMu must be locked for writing. +func (d *dentry) copyXattrsLocked(ctx context.Context) error { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]} + upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD} + + lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0) + if err != nil { + if err == syserror.EOPNOTSUPP { + // There are no guarantees as to the contents of lowerXattrs. + return nil + } + ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err) + return err + } + + for _, name := range lowerXattrs { + // Do not copy up overlay attributes. + if isOverlayXattr(name) { + continue + } + + value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0}) + if err != nil { + ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err) + return err + } + + if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil { + ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err) + return err + } + } + return nil +} + +// copyUpDescendantsLocked ensures that all descendants of d are copied up. +// +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error { + dirents, err := d.getDirentsLocked(ctx) + if err != nil { + return err + } + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + continue + } + child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds) + if err != nil { + return err + } + if err := child.copyUpLocked(ctx); err != nil { + return err + } + if child.isDir() { + child.dirMu.Lock() + err := child.copyUpDescendantsLocked(ctx, ds) + child.dirMu.Unlock() + if err != nil { + return err + } + } + } + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go new file mode 100644 index 000000000..df4492346 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -0,0 +1,301 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +// Preconditions: +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + whiteouts := make(map[string]bool) + var maybeWhiteouts []string + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef(ctx) + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := whiteouts[dirent.Name]; ok { + // This file has been whited-out in a previous layer. + return nil + } + if dirent.Type == linux.DT_CHR { + // We have to determine if this is a whiteout, which doesn't + // count against the directory's emptiness. However, we can't + // do so while holding locks held by layerFD.IterDirents(). + maybeWhiteouts = append(maybeWhiteouts, dirent.Name) + return nil + } + // Non-whiteout file in the directory prevents rmdir. + return syserror.ENOTEMPTY + })) + if err != nil { + readdirErr = err + return false + } + + for _, maybeWhiteoutName := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(maybeWhiteoutName), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor != 0 || stat.RdevMinor != 0 { + // This file is a real character device, not a whiteout. + readdirErr = syserror.ENOTEMPTY + return false + } + whiteouts[maybeWhiteoutName] = isUpper + } + // Continue iteration since we haven't found any non-whiteout files in + // this directory yet. + return true + }) + return whiteouts, readdirErr +} + +// +stateify savable +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + mu sync.Mutex `state:"nosave"` + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release(ctx context.Context) { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + d := fd.dentry() + defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) + + fd.mu.Lock() + defer fd.mu.Unlock() + + if fd.dirents == nil { + ds, err := d.getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). +func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + return d.getDirentsLocked(ctx) +} + +// Preconditions: +// * filesystem.renameMu must be locked. +// * d.dirMu must be locked. +// * d.isDir(). +func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) { + if d.dirents != nil { + return d.dirents, nil + } + + parent := genericParentOrSelf(d) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + + // Merge dirents from all layers comprising this directory. + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + prevDirents := make(map[string]struct{}) + var maybeWhiteouts []vfs.Dirent + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef(ctx) + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := prevDirents[dirent.Name]; ok { + // This file is hidden by, or merged with, another file with + // the same name in a previous layer. + return nil + } + prevDirents[dirent.Name] = struct{}{} + if dirent.Type == linux.DT_CHR { + // We can't determine if this file is a whiteout while holding + // locks held by layerFD.IterDirents(). + maybeWhiteouts = append(maybeWhiteouts, dirent) + return nil + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + return nil + })) + if err != nil { + readdirErr = err + return false + } + + for _, dirent := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(dirent.Name), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor == 0 && stat.RdevMinor == 0 { + // This file is a whiteout; don't emit a dirent for it. + continue + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + } + return true + }) + if readdirErr != nil { + return nil, readdirErr + } + + // Cache dirents for future directoryFDs. + d.dirents = dirents + return dirents, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} + +// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper +// layer, if there is one. The lower layer doesn't need to sync because it +// never changes. +func (fd *directoryFD) Sync(ctx context.Context) error { + d := fd.dentry() + if !d.isCopiedUp() { + return nil + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + } + upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) + if err != nil { + return err + } + err = upperFD.Sync(ctx) + upperFD.DecRef(ctx) + return err +} diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go new file mode 100644 index 000000000..10161a08d --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -0,0 +1,1780 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "fmt" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs +// attributes. +// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX +const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay." + +// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for +// opaque directories. +// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE +const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque" + +func isWhiteout(stat *linux.Statx) bool { + return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + if fs.opts.UpperRoot.Ok() { + return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx) + } + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +// * !rp.Done(). +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// Preconditions: +// * fs.renameMu must be locked. +// * d.dirMu must be locked. +func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { + if child, ok := parent.children[name]; ok { + return child, nil + } + child, err := fs.lookupLocked(ctx, parent, name) + if err != nil { + return nil, err + } + if parent.children == nil { + parent.children = make(map[string]*dentry) + } + parent.children[name] = child + // child's refcount is initially 0, so it may be dropped after traversal. + *ds = appendDentry(*ds, child) + return child, nil +} + +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + childPath := fspath.Parse(name) + child := fs.newDentry() + existsOnAnyLayer := false + var lookupErr error + + vfsObj := fs.vfsfs.VirtualFilesystem() + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.GetDentryOptions{}) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next one. + return true + } + if err != nil { + lookupErr = err + return false + } + defer childVD.DecRef(ctx) + + mask := uint32(linux.STATX_TYPE) + if !existsOnAnyLayer { + // Mode, UID, GID, and (for non-directories) inode number come from + // the topmost layer on which the file exists. + mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + } + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + lookupErr = err + return false + } + if stat.Mask&mask != mask { + lookupErr = syserror.EREMOTE + return false + } + + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + return false + } + isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR + if existsOnAnyLayer && !isDir { + // Directories are not merged with non-directory files from lower + // layers; instead, layers including and below the first + // non-directory file are ignored. (This file must be a directory + // on previous layers, since lower layers aren't searched for + // non-directory files.) + return false + } + + // Update child to include this layer. + childVD.IncRef() + if isUpper { + child.upperVD = childVD + child.copiedUp = 1 + } else { + child.lowerVDs = append(child.lowerVDs, childVD) + } + if !existsOnAnyLayer { + existsOnAnyLayer = true + child.mode = uint32(stat.Mode) + child.uid = stat.UID + child.gid = stat.GID + child.devMajor = stat.DevMajor + child.devMinor = stat.DevMinor + child.ino = stat.Ino + } + + // For non-directory files, only the topmost layer that contains a file + // matters. + if !isDir { + return false + } + + // Directories are merged with directories from lower layers if they + // are not explicitly opaque. + opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.GetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Size: 1, + }) + return !(err == nil && opaqueVal == "y") + }) + + if lookupErr != nil { + child.destroyLocked(ctx) + return nil, lookupErr + } + if !existsOnAnyLayer { + child.destroyLocked(ctx) + return nil, syserror.ENOENT + } + + // Device and inode numbers were copied from the topmost layer above; + // override them if necessary. + if child.isDir() { + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = fs.dirDevMinor + child.ino = fs.newDirIno() + } else if !child.upperVD.Ok() { + childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor) + if err != nil { + ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err) + child.destroyLocked(ctx) + return nil, err + } + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = childDevMinor + } + + parent.IncRef() + child.parent = parent + child.name = name + return child, nil +} + +// lookupLayerLocked is similar to lookupLocked, but only returns information +// about the file rather than a dentry. +// +// Preconditions: +// * fs.renameMu must be locked. +// * parent.dirMu must be locked. +func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) { + childPath := fspath.Parse(name) + lookupLayer := lookupLayerNone + var lookupErr error + + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE, + }) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next + // one. + return true + } + if err != nil { + lookupErr = err + return false + } + if stat.Mask&linux.STATX_TYPE == 0 { + // Linux's overlayfs tends to return EREMOTE in cases where a file + // is unusable for reasons that are not better captured by another + // errno. + lookupErr = syserror.EREMOTE + return false + } + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + if isUpper { + lookupLayer = lookupLayerUpperWhiteout + } + return false + } + // The file exists; we can stop searching. + if isUpper { + lookupLayer = lookupLayerUpper + } else { + lookupLayer = lookupLayerLower + } + return false + }) + + return lookupLayer, lookupErr +} + +type lookupLayer int + +const ( + // lookupLayerNone indicates that no file exists at the given path on the + // upper layer, and is either whited out or does not exist on lower layers. + // Therefore, the file does not exist in the overlay filesystem, and file + // creation may proceed normally (if an upper layer exists). + lookupLayerNone lookupLayer = iota + + // lookupLayerLower indicates that no file exists at the given path on the + // upper layer, but exists on a lower layer. Therefore, the file exists in + // the overlay filesystem, but must be copied-up before mutation. + lookupLayerLower + + // lookupLayerUpper indicates that a non-whiteout file exists at the given + // path on the upper layer. Therefore, the file exists in the overlay + // filesystem, and is already copied-up. + lookupLayerUpper + + // lookupLayerUpperWhiteout indicates that a whiteout exists at the given + // path on the upper layer. Therefore, the file does not exist in the + // overlay filesystem, and file creation must remove the whiteout before + // proceeding. + lookupLayerUpperWhiteout +) + +func (ll lookupLayer) existsInOverlay() bool { + return ll == lookupLayerLower || ll == lookupLayerUpper +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: +// * fs.renameMu must be locked. +// * !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + if parent.vfsd.IsDead() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Determine if a file already exists at name. + if _, ok := parent.children[name]; ok { + return syserror.EEXIST + } + childLayer, err := fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if childLayer.existsInOverlay() { + return syserror.EEXIST + } + + // Ensure that the parent directory is copied-up so that we can create the + // new file in the upper layer. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + // Finally create the new file. + if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil { + return err + } + + parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) + return nil +} + +// Preconditions: pop's parent directory has been copied up. +func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error { + return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{ + Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0 + // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV + }) +} + +func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { + if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)) + } +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + layerVD := d.topLayer() + return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &opts) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + old := vd.Dentry().Impl().(*dentry) + if old.isDir() { + return syserror.EPERM + } + if err := old.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := fs.vfsfs.VirtualFilesystem() + newpop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + return err + } + } + if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: old.upperVD, + Start: old.upperVD, + }, &newpop); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */) + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + if haveUpperWhiteout { + // There may be directories on lower layers (previously hidden by + // the whiteout) that the new directory should not be merged with. + // Mark it opaque to prevent merging. + if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)) + } else { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + } + return nil + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + // Disallow attempts to create whiteouts. + if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { + return syserror.EPERM + } + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + return nil + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + mayWrite := vfs.AccessTypesForOpenFlags(&opts).MayWrite() + + var ds *[]*dentry + fs.renameMu.RLock() + unlocked := false + unlock := func() { + if !unlocked { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + unlocked = true + } + } + defer unlock() + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + if start.isRegularFile() && mayWrite { + if err := start.copyUpLocked(ctx); err != nil { + return nil, err + } + } + start.IncRef() + defer start.DecRef(ctx) + unlock() + return start.openCopiedUp(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) + if err == syserror.ENOENT && mayCreate { + fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds) + parent.dirMu.Unlock() + return fd, err + } + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + // Open existing child or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + if rp.MustBeDir() && !child.isDir() { + return nil, syserror.ENOTDIR + } + if child.isRegularFile() && mayWrite { + if err := child.copyUpLocked(ctx); err != nil { + return nil, err + } + } + child.IncRef() + defer child.DecRef(ctx) + unlock() + return child.openCopiedUp(ctx, rp, &opts) +} + +// Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has +// been copied up. +func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + mnt := rp.Mount() + + // Directory FDs open FDs from each layer when directory entries are read, + // so they don't require opening an FD from d.topLayer() up front. + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + if ftype == linux.S_IFDIR { + // Can't open directories with O_CREAT. + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats.MayWrite() { + return nil, syserror.EISDIR + } + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fd := &directoryFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + } + + layerVD, isUpper := d.topLayerInfo() + layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, opts) + if err != nil { + return nil, err + } + if ftype != linux.S_IFREG { + return layerFD, nil + } + layerFlags := layerFD.StatusFlags() + fd := ®ularFileFD{ + copiedUp: isUpper, + cachedFD: layerFD, + cachedFlags: layerFlags, + } + fd.LockFD.Init(&d.locks) + layerFDOpts := layerFD.Options() + if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { + layerFD.DecRef(ctx) + return nil, err + } + return &fd.vfsfd, nil +} + +// Preconditions: +// * parent.dirMu must be locked. +// * parent does not already contain a child named rp.Component(). +func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { + creds := rp.Credentials() + if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil { + return nil, err + } + if parent.vfsd.IsDead() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + if err := parent.copyUpLocked(ctx); err != nil { + return nil, err + } + + vfsObj := fs.vfsfs.VirtualFilesystem() + childName := rp.Component() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + // We don't know if a whiteout exists on the upper layer; speculatively + // unlink it. + // + // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do + // know whether a whiteout exists. + var haveUpperWhiteout bool + switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err { + case nil: + haveUpperWhiteout = true + case syserror.ENOENT: + haveUpperWhiteout = false + default: + return nil, err + } + // Create the file on the upper layer, and get an FD representing it. + upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{ + Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL, + Mode: opts.Mode, + }) + if err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Change the file's owner to the caller. We can't use upperFD.SetStat() + // because it will pick up creds from ctx. + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Re-lookup to get a dentry representing the new file, which is needed for + // the returned FD. + child, err := fs.getChildLocked(ctx, parent, childName, ds) + if err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Finally construct the overlay FD. + upperFlags := upperFD.StatusFlags() + fd := ®ularFileFD{ + copiedUp: true, + cachedFD: upperFD, + cachedFlags: upperFlags, + } + fd.LockFD.Init(&child.locks) + upperFDOpts := upperFD.Options() + if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { + upperFD.DecRef(ctx) + // Don't bother with cleanup; the file was created successfully, we + // just can't open it anymore for some reason. + return nil, err + } + parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */) + return &fd.vfsfd, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + layerVD := d.topLayer() + return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + return syserror.EINVAL + } + + var ds *[]*dentry + fs.renameMu.Lock() + defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) + newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) + if err != nil { + return err + } + if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil { + return err + } + if renamed.isDir() { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.vfsd.IsDead() { + return syserror.ENOENT + } + replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName) + if err != nil { + return err + } + var ( + replaced *dentry + replacedVFSD *vfs.Dentry + whiteouts map[string]bool + ) + if replacedLayer.existsInOverlay() { + replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds) + if err != nil { + return err + } + replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + if genericIsAncestorDentry(replaced, renamed) { + return syserror.ENOTEMPTY + } + replaced.dirMu.Lock() + defer replaced.dirMu.Unlock() + whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + + // renamed and oldParent need to be copied-up before they're renamed on the + // upper layer. + if err := renamed.copyUpLocked(ctx); err != nil { + return err + } + // If renamed is a directory, all of its descendants need to be copied-up + // before they're renamed on the upper layer. + if renamed.isDir() { + if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil { + return err + } + } + // newParent must be copied-up before it can contain renamed on the upper + // layer. + if err := newParent.copyUpLocked(ctx); err != nil { + return err + } + // If replaced exists, it doesn't need to be copied-up, but we do need to + // serialize with copy-up. Holding renameMu for writing should be + // sufficient, but out of an abundance of caution... + if replaced != nil { + replaced.copyMu.RLock() + defer replaced.copyMu.RUnlock() + } + + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { + return err + } + + newpop := vfs.PathOperation{ + Root: newParent.upperVD, + Start: newParent.upperVD, + Path: fspath.Parse(newName), + } + + needRecreateWhiteouts := false + cleanupRecreateWhiteouts := func() { + if !needRecreateWhiteouts { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err)) + } + } + } + if renamed.isDir() { + if replacedLayer == lookupLayerUpper { + // Remove whiteouts from the directory being replaced. + needRecreateWhiteouts = true + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: replaced.upperVD, + Start: replaced.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } else if replacedLayer == lookupLayerUpperWhiteout { + // We need to explicitly remove the whiteout since otherwise rename + // on the upper layer will fail with ENOTDIR. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } + } + + // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a + // regular rename and create the whiteout at the origin manually. Unlike + // RENAME_WHITEOUT, this isn't atomic with respect to other users of the + // upper filesystem, but this is already the case for virtually all other + // overlay filesystem operations too. + oldpop := vfs.PathOperation{ + Root: oldParent.upperVD, + Start: oldParent.upperVD, + Path: fspath.Parse(oldName), + } + if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + + // Below this point, the renamed dentry is now at newpop, and anything we + // replaced is gone forever. Commit the rename, update the overlay + // filesystem tree, and abandon attempts to recover from errors. + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) + delete(oldParent.children, oldName) + if replaced != nil { + ds = appendDentry(ds, replaced) + } + if oldParent != newParent { + newParent.dirents = nil + // This can't drop the last reference on oldParent because one is held + // by oldParentVD, so lock recursion is impossible. + oldParent.DecRef(ctx) + ds = appendDentry(ds, oldParent) + newParent.IncRef() + renamed.parent = newParent + } + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + oldParent.dirents = nil + + if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err)) + } + if renamed.isDir() { + if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err)) + } + } + + vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + name := rp.Component() + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Ensure that parent is copied-up before potentially holding child.copyMu + // below. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + // Unlike UnlinkAt, we need a dentry representing the child directory being + // removed in order to verify that it's empty. + child, err := fs.getChildLocked(ctx, parent, name, &ds) + if err != nil { + return err + } + if !child.isDir() { + return syserror.ENOTDIR + } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } + child.dirMu.Lock() + defer child.dirMu.Unlock() + whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + child.copyMu.RLock() + defer child.copyMu.RUnlock() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(name), + } + if child.upperVD.Ok() { + cleanupRecreateWhiteouts := func() { + if !child.upperVD.Ok() { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: child.upperVD, + Start: child.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)) + } + } + } + // Remove existing whiteouts on the upper layer. + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.upperVD, + Start: child.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortDeleteDentry(&child.vfsd) + return err + } + } + // Remove the existing directory on the upper layer. + if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortDeleteDentry(&child.vfsd) + return err + } + } + if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { + // Don't attempt to recover from this: the original directory is + // already gone, so any dentries representing it are invalid, and + // creating a new directory won't undo that. + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)) + } + + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + delete(parent.children, name) + ds = appendDentry(ds, child) + parent.dirents = nil + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + err = d.setStatLocked(ctx, rp, opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent) + } + return nil +} + +// Precondition: d.fs.renameMu must be held for reading. +func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + }, &opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + return nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + layerVD := d.topLayer() + stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + if err != nil { + return linux.Statx{}, err + } + } + d.statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + return fs.statFS(ctx) +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + return nil + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef(ctx) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Ensure that parent is copied-up before potentially holding child.copyMu + // below. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + parentMode := atomic.LoadUint32(&parent.mode) + child := parent.children[name] + var childLayer lookupLayer + if child == nil { + if parentMode&linux.S_ISVTX != 0 { + // If the parent's sticky bit is set, we need a child dentry to get + // its owner. + child, err = fs.getChildLocked(ctx, parent, name, &ds) + if err != nil { + return err + } + } else { + // Determine if the file being unlinked actually exists. Holding + // parent.dirMu prevents a dentry from being instantiated for the file, + // which in turn prevents it from being copied-up, so this result is + // stable. + childLayer, err = fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if !childLayer.existsInOverlay() { + return syserror.ENOENT + } + } + } + if child != nil { + if child.isDir() { + return syserror.EISDIR + } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + // Hold child.copyMu to prevent it from being copied-up during + // deletion. + child.copyMu.RLock() + defer child.copyMu.RUnlock() + if child.upperVD.Ok() { + childLayer = lookupLayerUpper + } else { + childLayer = lookupLayerLower + } + } + + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(name), + } + if childLayer == lookupLayerUpper { + // Remove the existing file on the upper layer. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err + } + } + if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { + panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)) + } + + var cw *vfs.Watches + if child != nil { + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) + delete(parent.children, name) + ds = appendDentry(ds, child) + cw = &child.watches + } + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) + parent.dirents = nil + return nil +} + +// isOverlayXattr returns whether the given extended attribute configures the +// overlay. +func isOverlayXattr(name string) bool { + return strings.HasPrefix(name, _OVL_XATTR_PREFIX) +} + +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + + return fs.listXattr(ctx, d, size) +} + +func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size) + if err != nil { + return nil, err + } + + // Filter out all overlay attributes. + n := 0 + for _, name := range names { + if !isOverlayXattr(name) { + names[n] = name + n++ + } + } + return names[:n], err +} + +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + + return fs.getXattr(ctx, d, rp.Credentials(), &opts) +} + +func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { + return "", err + } + + // Return EOPNOTSUPP when fetching an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_get(). + if isOverlayXattr(opts.Name) { + return "", syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_get(). + vfsObj := d.fs.vfsfs.VirtualFilesystem() + top := d.topLayer() + return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts) +} + +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + + err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil +} + +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { + return err + } + + // Return EOPNOTSUPP when setting an overlay attribute. + // See fs/overlayfs/super.c:ovl_own_xattr_set(). + if isOverlayXattr(opts.Name) { + return syserror.EOPNOTSUPP + } + + // Analogous to fs/overlayfs/super.c:ovl_other_xattr_set(). + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts) +} + +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + return err + } + + err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name) + fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent) + return nil +} + +// Precondition: fs.renameMu must be locked. +func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error { + if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { + return err + } + + // Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute. + // Linux passes the remove request to xattr_handler->set. + // See fs/xattr.c:vfs_removexattr(). + if isOverlayXattr(name) { + return syserror.EOPNOTSUPP + } + + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name) +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) +} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go new file mode 100644 index 000000000..c812f0a70 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -0,0 +1,798 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package overlay provides an overlay filesystem implementation, which +// synthesizes a filesystem by composing one or more immutable filesystems +// ("lower layers") with an optional mutable filesystem ("upper layer"). +// +// Lock order: +// +// directoryFD.mu / regularFileFD.mu +// filesystem.renameMu +// dentry.dirMu +// dentry.copyMu +// filesystem.devMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.dataMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. +package overlay + +import ( + "fmt" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/refsvfs2" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "overlay" + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// Release implements FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to +// FilesystemType.GetFilesystem. +// +// +stateify savable +type FilesystemOptions struct { + // Callers passing FilesystemOptions to + // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that + // the vfs.Mounts comprising the layers of the overlay filesystem do not + // contain submounts. + + // If UpperRoot.Ok(), it is the root of the writable upper layer of the + // overlay. + UpperRoot vfs.VirtualDentry + + // LowerRoots contains the roots of the immutable lower layers of the + // overlay. LowerRoots is immutable. + LowerRoots []vfs.VirtualDentry +} + +// filesystem implements vfs.FilesystemImpl. +// +// +stateify savable +type filesystem struct { + vfsfs vfs.Filesystem + + // Immutable options. + opts FilesystemOptions + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the filesystem's layers. creds is immutable. + creds *auth.Credentials + + // dirDevMinor is the device minor number used for directories. dirDevMinor + // is immutable. + dirDevMinor uint32 + + // lowerDevMinors maps device numbers from lower layer filesystems to + // device minor numbers assigned to non-directory files originating from + // that filesystem. (This remapping is necessary for lower layers because a + // file on a lower layer, and that same file on an overlay, are + // distinguishable because they will diverge after copy-up; this isn't true + // for non-directory files already on the upper layer.) lowerDevMinors is + // protected by devMu. + devMu sync.Mutex `state:"nosave"` + lowerDevMinors map[layerDevNumber]uint32 + + // renameMu synchronizes renaming with non-renaming operations in order to + // ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex `state:"nosave"` + + // lastDirIno is the last inode number assigned to a directory. lastDirIno + // is accessed using atomic memory operations. + lastDirIno uint64 +} + +// +stateify savable +type layerDevNumber struct { + major uint32 + minor uint32 +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mopts := vfs.GenericParseMountOptions(opts.Data) + fsoptsRaw := opts.InternalData + fsopts, ok := fsoptsRaw.(FilesystemOptions) + if fsoptsRaw != nil && !ok { + ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + return nil, nil, syserror.EINVAL + } + vfsroot := vfs.RootFromContext(ctx) + if vfsroot.Ok() { + defer vfsroot.DecRef(ctx) + } + + if upperPathname, ok := mopts["upperdir"]; ok { + if fsopts.UpperRoot.Ok() { + ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") + return nil, nil, syserror.EINVAL + } + delete(mopts, "upperdir") + // Linux overlayfs also requires a workdir when upperdir is + // specified; we don't, so silently ignore this option. + delete(mopts, "workdir") + upperPath := fspath.Parse(upperPathname) + if !upperPath.Absolute { + ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + return nil, nil, syserror.EINVAL + } + upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: upperPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) + upperRoot.DecRef(ctx) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + defer privateUpperRoot.DecRef(ctx) + fsopts.UpperRoot = privateUpperRoot + } + + if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { + if len(fsopts.LowerRoots) != 0 { + ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") + return nil, nil, syserror.EINVAL + } + delete(mopts, "lowerdir") + lowerPathnames := strings.Split(lowerPathnamesStr, ":") + for _, lowerPathname := range lowerPathnames { + lowerPath := fspath.Parse(lowerPathname) + if !lowerPath.Absolute { + ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + return nil, nil, syserror.EINVAL + } + lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: lowerPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) + lowerRoot.DecRef(ctx) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + defer privateLowerRoot.DecRef(ctx) + fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) + } + } + + if len(mopts) != 0 { + ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + if len(fsopts.LowerRoots) == 0 { + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") + return nil, nil, syserror.EINVAL + } + if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { + ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") + return nil, nil, syserror.EINVAL + } + const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK + if len(fsopts.LowerRoots) > maxLowerLayers { + ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) + return nil, nil, syserror.EINVAL + } + + // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. + dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + // Take extra references held by the filesystem. + if fsopts.UpperRoot.Ok() { + fsopts.UpperRoot.IncRef() + } + for _, lowerRoot := range fsopts.LowerRoots { + lowerRoot.IncRef() + } + + fs := &filesystem{ + opts: fsopts, + creds: creds.Fork(), + dirDevMinor: dirDevMinor, + lowerDevMinors: make(map[layerDevNumber]uint32), + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. + root := fs.newDentry() + root.refs = 1 + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.IncRef() + root.copiedUp = 1 + root.upperVD = fs.opts.UpperRoot + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.IncRef() + root.lowerVDs = append(root.lowerVDs, lowerRoot) + } + rootTopVD := root.topLayer() + // Get metadata from the topmost layer. See fs.lookupLocked(). + const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: rootTopVD, + Start: rootTopVD, + }, &vfs.StatOptions{ + Mask: rootStatMask, + }) + if err != nil { + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) + return nil, nil, err + } + if rootStat.Mask&rootStatMask != rootStatMask { + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) + return nil, nil, syserror.EREMOTE + } + if isWhiteout(&rootStat) { + ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) + return nil, nil, syserror.EINVAL + } + root.mode = uint32(rootStat.Mode) + root.uid = rootStat.UID + root.gid = rootStat.GID + if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { + root.devMajor = linux.UNNAMED_MAJOR + root.devMinor = fs.dirDevMinor + root.ino = fs.newDirIno() + } else if !root.upperVD.Ok() { + root.devMajor = linux.UNNAMED_MAJOR + rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) + return nil, nil, err + } + root.devMinor = rootDevMinor + root.ino = rootStat.Ino + } else { + root.devMajor = rootStat.DevMajor + root.devMinor = rootStat.DevMinor + root.ino = rootStat.Ino + } + + return &fs.vfsfs, &root.vfsd, nil +} + +// clonePrivateMount creates a non-recursive bind mount rooted at vd, not +// associated with any MountNamespace, and returns the root of the new mount. +// (This is required to ensure that each layer of an overlay comprises only a +// single mount, and therefore can't cross into e.g. the overlay filesystem +// itself, risking lock recursion.) A reference is held on the returned +// VirtualDentry. +func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { + oldmnt := vd.Mount() + opts := oldmnt.Options() + if forceReadOnly { + opts.ReadOnly = true + } + newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) + if err != nil { + return vfs.VirtualDentry{}, err + } + // Take a reference on the dentry which will be owned by the returned + // VirtualDentry. + d := vd.Dentry() + d.IncRef() + return vfs.MakeVirtualDentry(newmnt, d), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + vfsObj := fs.vfsfs.VirtualFilesystem() + vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) + for _, lowerDevMinor := range fs.lowerDevMinors { + vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + } + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.DecRef(ctx) + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.DecRef(ctx) + } +} + +func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { + // Always statfs the root of the topmost layer. Compare Linux's + // fs/overlayfs/super.c:ovl_statfs(). + var rootVD vfs.VirtualDentry + if fs.opts.UpperRoot.Ok() { + rootVD = fs.opts.UpperRoot + } else { + rootVD = fs.opts.LowerRoots[0] + } + fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ + Root: rootVD, + Start: rootVD, + }) + if err != nil { + return linux.Statfs{}, err + } + fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC + return fsstat, nil +} + +func (fs *filesystem) newDirIno() uint64 { + return atomic.AddUint64(&fs.lastDirIno, 1) +} + +func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { + fs.devMu.Lock() + defer fs.devMu.Unlock() + orig := layerDevNumber{layerMajor, layerMinor} + if minor, ok := fs.lowerDevMinors[orig]; ok { + return minor, nil + } + minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() + if err != nil { + return 0, err + } + fs.lowerDevMinors[orig] = minor + return minor, nil +} + +// dentry implements vfs.DentryImpl. +// +// +stateify savable +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid, and gid are the file mode, owner, and group of the file in + // the topmost layer (and therefore the overlay file as well), and are used + // for permission checks on this dentry. These fields are protected by + // copyMu and accessed using atomic memory operations. + mode uint32 + uid uint32 + gid uint32 + + // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and + // 0 otherwise. copiedUp is accessed using atomic memory operations. + copiedUp uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex `state:"nosave"` + children map[string]*dentry + dirents []vfs.Dirent + + // upperVD and lowerVDs are the files from the overlay filesystem's layers + // that comprise the file on the overlay filesystem. + // + // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. + // be copied up) with copyMu locked for writing; otherwise, it is + // immutable. lowerVDs is always immutable. + copyMu sync.RWMutex `state:"nosave"` + upperVD vfs.VirtualDentry + lowerVDs []vfs.VirtualDentry + + // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= + // len(inlineLowerVDs). + inlineLowerVDs [1]vfs.VirtualDentry + + // devMajor, devMinor, and ino are the device major/minor and inode numbers + // used by this dentry. These fields are protected by copyMu and accessed + // using atomic memory operations. + devMajor uint32 + devMinor uint32 + ino uint64 + + // If this dentry represents a regular file, then: + // + // - mapsMu is used to synchronize between copy-up and memmap.Mappable + // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. + // + // - dataMu is used to synchronize between copy-up and + // dentry.(memmap.Mappable).Translate. + // + // - lowerMappings tracks memory mappings of the file. lowerMappings is + // used to invalidate mappings of the lower layer when the file is copied + // up to ensure that they remain coherent with subsequent writes to the + // file. (Note that, as of this writing, Linux overlayfs does not do this; + // this feature is a gVisor extension.) lowerMappings is protected by + // mapsMu. + // + // - If this dentry is copied-up, then wrappedMappable is the Mappable + // obtained from a call to the current top layer's + // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil + // (from a call to regularFileFD.ensureMappable()), it cannot become nil. + // wrappedMappable is protected by mapsMu and dataMu. + // + // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is + // accessed using atomic memory operations. + mapsMu sync.Mutex `state:"nosave"` + lowerMappings memmap.MappingSet + dataMu sync.RWMutex `state:"nosave"` + wrappedMappable memmap.Mappable + isMappable uint32 + + locks vfs.FileLocks + + // watches is the set of inotify watches on the file repesented by this dentry. + // + // Note that hard links to the same file will not share the same set of + // watches, due to the fact that we do not have inode structures in this + // overlay implementation. + watches vfs.Watches +} + +// newDentry creates a new dentry. The dentry initially has no references; it +// is the caller's responsibility to set the dentry's reference count and/or +// call dentry.destroy() as appropriate. The dentry is initially invalid in +// that it contains no layers; the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.lowerVDs = d.inlineLowerVDs[:0] + d.vfsd.Init(d) + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Register(d, "overlay.dentry") + } + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkDropLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef(ctx context.Context) { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkDropLocked(ctx context.Context) { + // Dentries with a positive reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) Dentries with a + // negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + + // Make sure that we do not lose watches on dentries that have not been + // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so + // d.vfsd.IsDead() indicates that d was deleted. + if !d.vfsd.IsDead() && d.watches.Size() > 0 { + return + } + + // Refs is still zero; destroy it. + d.destroyLocked(ctx) + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: +// * d.fs.renameMu must be locked for writing. +// * d.refs == 0. +func (d *dentry) destroyLocked(ctx context.Context) { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("overlay.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("overlay.dentry.destroyLocked() called with references on the dentry") + } + + if d.upperVD.Ok() { + d.upperVD.DecRef(ctx) + } + for _, lowerVD := range d.lowerVDs { + lowerVD.DecRef(ctx) + } + + d.watches.HandleDeletion(ctx) + + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkDropLocked(ctx) + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } + } + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Unregister(d, "overlay.dentry") + } +} + +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (d *dentry) LeakMessage() string { + return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { + if d.isDir() { + events |= linux.IN_ISDIR + } + + // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + deleted := d.vfsd.IsDead() + + d.fs.renameMu.RLock() + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) + } + d.watches.Notify(ctx, "", events, cookie, et, deleted) + d.fs.renameMu.RUnlock() +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.watches +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +func (d *dentry) OnZeroWatches(ctx context.Context) { + if atomic.LoadInt64(&d.refs) == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } +} + +// iterLayers invokes yield on each layer comprising d, from top to bottom. If +// any call to yield returns false, iterLayer stops iteration. +func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { + if d.isCopiedUp() { + if !yield(d.upperVD, true) { + return + } + } + for _, lowerVD := range d.lowerVDs { + if !yield(lowerVD, false) { + return + } + } +} + +func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { + if d.isCopiedUp() { + return d.upperVD, true + } + return d.lowerVDs[0], false +} + +func (d *dentry) topLayer() vfs.VirtualDentry { + vd, _ := d.topLayerInfo() + return vd +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + kuid := auth.KUID(atomic.LoadUint32(&d.uid)) + kgid := auth.KGID(atomic.LoadUint32(&d.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err + } + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) +} + +// statInternalMask is the set of stat fields that is set by +// dentry.statInternalTo(). +const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + +// statInternalTo writes fields to stat that are stored in d, and therefore do +// not requiring invoking StatAt on the overlay's layers. +func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { + stat.Mask |= statInternalMask + if d.isDir() { + // Linux sets nlink to 1 for merged directories + // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is + // correct more often ("." and the directory's entry in its parent), + // and some of our tests expect this. + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = atomic.LoadUint64(&d.ino) + stat.DevMajor = atomic.LoadUint32(&d.devMajor) + stat.DevMinor = atomic.LoadUint32(&d.devMinor) +} + +// Preconditions: d.copyMu must be locked for writing. +func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { + if opts.Stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT)) + } + if opts.Stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, opts.Stat.UID) + } + if opts.Stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, opts.Stat.GID) + } +} + +// fileDescription is embedded by overlay implementations of +// vfs.FileDescriptionImpl. +// +// +stateify savable +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.filesystem().listXattr(ctx, fd.dentry(), size) +} + +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { + fs := fd.filesystem() + d := fd.dentry() + + fs.renameMu.RLock() + err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) + fs.renameMu.RUnlock() + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { + fs := fd.filesystem() + d := fd.dentry() + + fs.renameMu.RLock() + err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) + fs.renameMu.RUnlock() + if err != nil { + return err + } + + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go new file mode 100644 index 000000000..2b89a7a6d --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -0,0 +1,456 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isRegularFile() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG +} + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + layerVD := d.topLayer() + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +// +stateify savable +type regularFileFD struct { + fileDescription + + // If copiedUp is false, cachedFD represents + // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents + // fileDescription.dentry().upperVD. cachedFlags is the last known value of + // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are + // protected by mu. + mu sync.Mutex `state:"nosave"` + copiedUp bool + cachedFD *vfs.FileDescription + cachedFlags uint32 + + // If copiedUp is false, lowerWaiters contains all waiter.Entries + // registered with cachedFD. lowerWaiters is protected by mu. + lowerWaiters map[*waiter.Entry]waiter.EventMask +} + +func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return nil, err + } + wrappedFD.IncRef() + return wrappedFD, nil +} + +func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { + d := fd.dentry() + statusFlags := fd.vfsfd.StatusFlags() + if !fd.copiedUp && d.isCopiedUp() { + // Switch to the copied-up file. + upperVD := d.topLayer() + upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: upperVD, + Start: upperVD, + }, &vfs.OpenOptions{ + Flags: statusFlags, + }) + if err != nil { + return nil, err + } + oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) + if oldOffErr == nil { + if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { + upperFD.DecRef(ctx) + return nil, err + } + } + if len(fd.lowerWaiters) != 0 { + ready := upperFD.Readiness(^waiter.EventMask(0)) + for e, mask := range fd.lowerWaiters { + fd.cachedFD.EventUnregister(e) + upperFD.EventRegister(e, mask) + if ready&mask != 0 { + e.Callback.Callback(e) + } + } + } + fd.cachedFD.DecRef(ctx) + fd.copiedUp = true + fd.cachedFD = upperFD + fd.cachedFlags = statusFlags + fd.lowerWaiters = nil + } else if fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { + return nil, err + } + fd.cachedFlags = statusFlags + } + return fd.cachedFD, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release(ctx context.Context) { + fd.cachedFD.DecRef(ctx) + fd.cachedFD = nil +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + // Linux doesn't define ovl_file_operations.flush at all (i.e. its + // equivalent to OnClose is a no-op). We pass through to + // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been + // copied-up, since OnClose is mostly used to define post-close writeback, + // and if fd.cachedFD hasn't been updated then it can't have been used to + // mutate fd.dentry() anyway. + fd.mu.Lock() + if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { + fd.mu.Unlock() + return err + } + fd.cachedFlags = statusFlags + } + wrappedFD := fd.cachedFD + fd.mu.Unlock() + return wrappedFD.OnClose(ctx) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return linux.Statx{}, err + } + stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + wrappedFD.DecRef(ctx) + if err != nil { + return linux.Statx{}, err + } + } + fd.dentry().statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Allocate(ctx, mode, offset, length) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + d := fd.dentry() + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := fd.vfsfd.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.SetStat(ctx, opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) + } + return nil +} + +// StatFS implements vfs.FileDescriptionImpl.StatFS. +func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.filesystem().statFS(ctx) +} + +// Readiness implements waiter.Waitable.Readiness. +func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + ctx := context.Background() + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return + // an error. This is obviously wrong, but at least consistent with + // VFS1. + log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err) + fd.mu.Lock() + wrappedFD = fd.cachedFD + wrappedFD.IncRef() + fd.mu.Unlock() + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(context.Background()) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't + // return an error. This is obviously wrong, but at least consistent + // with VFS1. + log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err) + wrappedFD = fd.cachedFD + } + wrappedFD.EventRegister(e, mask) + if !fd.copiedUp { + if fd.lowerWaiters == nil { + fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask) + } + fd.lowerWaiters[e] = mask + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *regularFileFD) EventUnregister(e *waiter.Entry) { + fd.mu.Lock() + defer fd.mu.Unlock() + fd.cachedFD.EventUnregister(e) + if !fd.copiedUp { + delete(fd.lowerWaiters, e) + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Hold fd.mu during the read to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Hold fd.mu during the write to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Write(ctx, src, opts) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Hold fd.mu during the seek to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Seek(ctx, offset, whence) +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + fd.mu.Lock() + if !fd.dentry().isCopiedUp() { + fd.mu.Unlock() + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + fd.mu.Unlock() + return err + } + wrappedFD.IncRef() + defer wrappedFD.DecRef(ctx) + fd.mu.Unlock() + return wrappedFD.Sync(ctx) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Ioctl(ctx, uio, args) +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if err := fd.ensureMappable(ctx, opts); err != nil { + return err + } + return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts) +} + +// ensureMappable ensures that fd.dentry().wrappedMappable is not nil. +func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + + // Fast path if we already have a Mappable for the current top layer. + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + + // Only permit mmap of regular files, since other file types may have + // unpredictable behavior when mmapped (e.g. /dev/zero). + if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG { + return syserror.ENODEV + } + + // Get a Mappable for the current top layer. + fd.mu.Lock() + defer fd.mu.Unlock() + d.copyMu.RLock() + defer d.copyMu.RUnlock() + if atomic.LoadUint32(&d.isMappable) != 0 { + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil { + return err + } + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef(ctx) + opts.MappingIdentity = nil + } + // Use this Mappable for all mappings of this layer (unless we raced with + // another call to ensureMappable). + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if d.wrappedMappable == nil { + d.wrappedMappable = opts.Mappable + atomic.StoreUint32(&d.isMappable, 1) + } + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, ar, offset, writable) + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable) + if !d.isCopiedUp() { + d.lowerMappings.RemoveMapping(ms, ar, offset, writable) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil { + return err + } + if !d.isCopiedUp() { + d.lowerMappings.AddMapping(ms, dstAR, offset, writable) + } + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.dataMu.RLock() + defer d.dataMu.RUnlock() + return d.wrappedMappable.Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + return d.wrappedMappable.InvalidateUnsavable(ctx) +} diff --git a/pkg/sentry/fsimpl/overlay/save_restore.go b/pkg/sentry/fsimpl/overlay/save_restore.go new file mode 100644 index 000000000..054e17b17 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/save_restore.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/refsvfs2" +) + +func (d *dentry) afterLoad() { + if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 { + refsvfs2.Register(d, "overlay.dentry") + } +} diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index cab771211..e44b79b68 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type filesystemType struct{} // Name implements vfs.FilesystemType.Name. @@ -38,11 +39,15 @@ func (filesystemType) Name() string { return "pipefs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("pipefs.filesystemType.GetFilesystem should never be called") } +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -63,9 +68,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. @@ -76,12 +81,15 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeNoopRefCount - pipe *pipe.VFSPipe + locks vfs.FileLocks + pipe *pipe.VFSPipe ino uint64 uid auth.KUID @@ -114,7 +122,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode.Stat. -func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, @@ -142,12 +150,14 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } -// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement -// statfs, from which we should indicate PIPEFS_MAGIC. - // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks) +} + +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read @@ -158,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry - d.Init(inode) - defer d.DecRef() + d.Init(&fs.Filesystem, inode) + defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 17c1342b5..5196a2a80 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,18 +1,79 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "fd_dir_inode_refs", + out = "fd_dir_inode_refs.go", + package = "proc", + prefix = "fdDirInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "fdDirInode", + }, +) + +go_template_instance( + name = "fd_info_dir_inode_refs", + out = "fd_info_dir_inode_refs.go", + package = "proc", + prefix = "fdInfoDirInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "fdInfoDirInode", + }, +) + +go_template_instance( + name = "subtasks_inode_refs", + out = "subtasks_inode_refs.go", + package = "proc", + prefix = "subtasksInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "subtasksInode", + }, +) + +go_template_instance( + name = "task_inode_refs", + out = "task_inode_refs.go", + package = "proc", + prefix = "taskInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "taskInode", + }, +) + +go_template_instance( + name = "tasks_inode_refs", + out = "tasks_inode_refs.go", + package = "proc", + prefix = "tasksInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "tasksInode", + }, +) + go_library( name = "proc", srcs = [ + "fd_dir_inode_refs.go", + "fd_info_dir_inode_refs.go", "filesystem.go", "subtasks.go", + "subtasks_inode_refs.go", "task.go", "task_fds.go", "task_files.go", + "task_inode_refs.go", "task_net.go", "tasks.go", "tasks_files.go", + "tasks_inode_refs.go", "tasks_sys.go", ], visibility = ["//pkg/sentry:internal"], @@ -21,7 +82,9 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", @@ -35,8 +98,10 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/tcpip/network/ipv4", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 609210253..99abcab66 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -17,6 +17,7 @@ package proc import ( "fmt" + "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,23 +25,29 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) -// Name is the default filesystem name. -const Name = "proc" +const ( + // Name is the default filesystem name. + Name = "proc" + defaultMaxCachedDentries = uint64(1000) +) // FilesystemType is the factory class for procfs. // // +stateify savable type FilesystemType struct{} -var _ vfs.FilesystemType = (*FilesystemType)(nil) - // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -61,9 +68,22 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF if err != nil { return nil, nil, err } + + mopts := vfs.GenericParseMountOptions(opts.Data) + maxCachedDentries := defaultMaxCachedDentries + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err = strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + } + procfs := &filesystem{ devMinor: devMinor, } + procfs.MaxCachedDentries = maxCachedDentries procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string @@ -72,33 +92,35 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := procfs.newTasksInode(k, pidns, cgroups) + inode := procfs.newTasksInode(ctx, k, pidns, cgroups) + var dentry kernfs.Dentry + dentry.Init(&procfs.Filesystem, inode) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. +// +// +stateify savable type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource - Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) + Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode { + inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm) + return inode } +// +stateify savable type staticFile struct { kernfs.DynamicBytesFile vfs.StaticData @@ -110,8 +132,24 @@ func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } +func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode { + return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) +} + // InternalData contains internal data passed in to the procfs mount via // vfs.GetFilesystemOptions.InternalData. +// +// +stateify savable type InternalData struct { Cgroups map[string]string } + +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 36a911db4..cb3c5e0fd 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,11 +31,16 @@ import ( // // +stateify savable type subtasksInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - kernfs.AlwaysValid + subtasksInodeRefs + + locks vfs.FileLocks fs *filesystem task *kernel.Task @@ -45,7 +50,7 @@ type subtasksInode struct { var _ kernfs.Inode = (*subtasksInode)(nil) -func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, @@ -53,18 +58,16 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. - subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} - dentry := &kernfs.Dentry{} - dentry.Init(inode) - - return dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT @@ -77,13 +80,11 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } - - subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) - return subTaskDentry.VFSDentry(), nil + return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { return offset, syserror.ENOENT @@ -113,6 +114,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb return offset, nil } +// +stateify savable type subtasksFD struct { kernfs.GenericDirectoryFD @@ -126,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac return fd.GenericDirectoryFD.IterDirents(ctx, cb) } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return 0, syserror.ENOENT @@ -150,21 +152,23 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro return fd.GenericDirectoryFD.SetStat(ctx, opts) } -// Open implements kernfs.Inode. -func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} - if err := fd.Init(&i.OrderedChildren, &opts); err != nil { + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }); err != nil { return nil, err } - if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// Stat implements kernfs.Inode. -func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.InodeAttrs.Stat(vsfs, opts) +// Stat implements kernfs.Inode.Stat. +func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } @@ -174,7 +178,12 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux. return stat, nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode.DecRef. +func (i *subtasksInode) DecRef(ctx context.Context) { + i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 482055db1..19011b010 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,91 +32,108 @@ import ( // // +stateify savable type taskInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNoDynamicLookup + implStatFS kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren + taskInodeRefs + + locks vfs.FileLocks task *kernel.Task } var _ kernfs.Inode = (*taskInode)(nil) -func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { - // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. - contents := map[string]*kernfs.Dentry{ - "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), - "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + + contents := map[string]kernfs.Inode{ + "auxv": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), "comm": fs.newComm(task, fs.NextIno(), 0444), - "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "cwd": fs.newCwdSymlink(task, fs.NextIno()), + "environ": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), "exe": fs.newExeSymlink(task, fs.NextIno()), "fd": fs.newFDDirInode(task), "fdinfo": fs.newFDInfoDirInode(task), - "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "gid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mem": fs.newMemInode(task, fs.NextIno(), 0400), + "mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}), "net": fs.newTaskNetDir(task), - "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), - "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), - "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. - taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} - dentry := &kernfs.Dentry{} - dentry.Init(inode) taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := taskInode.OrderedChildren.Populate(dentry, contents) + links := taskInode.OrderedChildren.Populate(contents) taskInode.IncLinks(links) - return dentry + return inode, nil } -// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long +// Valid implements kernfs.Inode.Valid. This inode remains valid as long // as the task is still running. When it's dead, another tasks with the same // PID could replace it. func (i *taskInode) Valid(ctx context.Context) bool { return i.task.ExitState() != kernel.TaskExitDead } -// Open implements kernfs.Inode. -func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +// Open implements kernfs.Inode.Open. +func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// DecRef implements kernfs.Inode.DecRef. +func (i *taskInode) DecRef(ctx context.Context) { + i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. +// +// +stateify savable type taskOwnedInode struct { kernfs.Inode @@ -126,36 +143,28 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) + inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return &taskOwnedInode{Inode: inode, owner: task} } -func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { - dir := &kernfs.StaticDirectory{} - +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode { // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) - - inode := &taskOwnedInode{Inode: dir, owner: task} - d := &kernfs.Dentry{} - d.Init(inode) + fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero} + dir := kernfs.NewStaticDir(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts) - dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := dir.OrderedChildren.Populate(d, children) - dir.IncLinks(links) + return &taskOwnedInode{Inode: dir, owner: task} +} - return d +func (i *taskOwnedInode) Valid(ctx context.Context) bool { + return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx) } -// Stat implements kernfs.Inode. -func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.Inode.Stat(fs, opts) +// Stat implements kernfs.Inode.Stat. +func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { return linux.Statx{}, err } @@ -171,7 +180,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S return stat, nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 44ccc9e4a..d268b44be 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -43,16 +42,19 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef() + file.DecRef(ctx) return true } +// +stateify savable type fdDir struct { + locks vfs.FileLocks + fs *filesystem task *kernel.Task @@ -61,16 +63,15 @@ type fdDir struct { produceSymlink bool } -// IterDirents implements kernfs.inodeDynamicLookup. -func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) { +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) - offset := absOffset + relOffset typ := uint8(linux.DT_REG) if i.produceSymlink { typ = linux.DT_LNK @@ -86,31 +87,39 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs Name: strconv.FormatUint(uint64(fd), 10), Type: typ, Ino: i.fs.NextIno(), - NextOff: offset + 1, + NextOff: int64(fd) + 3, } if err := cb.Handle(dirent); err != nil { - return offset, err + // Getdents should iterate correctly despite mutation + // of fds, so we return the next fd to serialize plus + // 2 (which accounts for the "." and ".." tracked by + // kernfs) as the offset. + return int64(fd) + 2, err } - offset++ } - return offset, nil + // We serialized them all. Next offset should be higher than last + // serialized fd. + return int64(fds[len(fds)-1]) + 3, nil } // fdDirInode represents the inode for /proc/[pid]/fd directory. // // +stateify savable type fdDirInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + fdDir + fdDirInodeRefs + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - kernfs.AlwaysValid - fdDir } var _ kernfs.Inode = (*fdDirInode)(nil) -func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode { inode := &fdDirInode{ fdDir: fdDir{ fs: fs, @@ -118,39 +127,42 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { produceSymlink: true, }, } - inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) - - dentry := &kernfs.Dentry{} - dentry.Init(inode) + inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + return inode +} - return dentry +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset) } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } - taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) - return taskDentry.VFSDentry(), nil + return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil } -// Open implements kernfs.Inode. -func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +// Open implements kernfs.Inode.Open. +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -// CheckPermissions implements kernfs.Inode. +// CheckPermissions implements kernfs.Inode.CheckPermissions. // // This is to match Linux, which uses a special permission handler to guarantee // that a process can still access /proc/self/fd after it has executed @@ -172,10 +184,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } +// DecRef implements kernfs.Inode.DecRef. +func (i *fdDirInode) DecRef(ctx context.Context) { + i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable type fdSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -186,26 +204,23 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ task: task, fd: fd, } - inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + return inode } -func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { +func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } @@ -214,75 +229,90 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil } +// Valid implements kernfs.Inode.Valid. +func (s *fdSymlink) Valid(ctx context.Context) bool { + return taskFDExists(ctx, s.task, s.fd) +} + // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. // // +stateify savable type fdInfoDirInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + fdDir + fdInfoDirInodeRefs + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren - kernfs.AlwaysValid - fdDir } var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode { inode := &fdInfoDirInode{ fdDir: fdDir{ fs: fs, task: task, }, } - inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) - - dentry := &kernfs.Dentry{} - dentry.Init(inode) + inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - - return dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ task: i.task, fd: fd, } - dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) - return dentry.VFSDentry(), nil + return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil } -// Open implements kernfs.Inode. -func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +// IterDirents implements Inode.IterDirents. +func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset) +} + +// Open implements kernfs.Inode.Open. +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (i *fdInfoDirInode) DecRef(ctx context.Context) { + i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 @@ -296,7 +326,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt @@ -304,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } + +// Valid implements kernfs.Inode.Valid. +func (d *fdInfoData) Valid(ctx context.Context) bool { + return taskFDExists(ctx, d.task, d.fd) +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 2f297e48a..ba71d0fde 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -30,10 +31,15 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) +// "There is an (arbitrary) limit on the number of lines in the file. As at +// Linux 3.18, the limit is five lines." - user_namespaces(7) +const maxIDMapLines = 5 + // mm gets the kernel task's MemoryManager. No additional reference is taken on // mm here. This is safe because MemoryManager.destroy is required to leave the // MemoryManager in a state where it's still usable as a DynamicBytesSource. @@ -226,8 +232,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Linux will return envp up to and including the first NULL character, // so find it. - if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 { - buf.Truncate(end) + envStart := int(ar.Length()) + if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { + buf.Truncate(envStart + nullIdx) } } @@ -241,13 +248,10 @@ type commInode struct { task *kernel.Task } -func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { inode := &commInode{task: task} - inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + inode.DynamicBytesFile.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) + return inode } func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { @@ -282,7 +286,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}. +// idMapData implements vfs.WritableDynamicBytesSource for +// /proc/[pid]/{gid_map|uid_map}. // // +stateify savable type idMapData struct { @@ -294,7 +299,7 @@ type idMapData struct { var _ dynamicInode = (*idMapData)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. +// Generate implements vfs.WritableDynamicBytesSource.Generate. func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { var entries []auth.IDMapEntry if d.gids { @@ -308,6 +313,216 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// Write implements vfs.WritableDynamicBytesSource.Write. +func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "In addition, the number of bytes written to the file must be less than + // the system page size, and the write must be performed at the start of + // the file ..." - user_namespaces(7) + srclen := src.NumBytes() + if srclen >= usermem.PageSize || offset != 0 { + return 0, syserror.EINVAL + } + b := make([]byte, srclen) + if _, err := src.CopyIn(ctx, b); err != nil { + return 0, err + } + + // Truncate from the first NULL byte. + var nul int64 + nul = int64(bytes.IndexByte(b, 0)) + if nul == -1 { + nul = srclen + } + b = b[:nul] + // Remove the last \n. + if nul >= 1 && b[nul-1] == '\n' { + b = b[:nul-1] + } + lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) + if len(lines) > maxIDMapLines { + return 0, syserror.EINVAL + } + + entries := make([]auth.IDMapEntry, len(lines)) + for i, l := range lines { + var e auth.IDMapEntry + _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) + if err != nil { + return 0, syserror.EINVAL + } + entries[i] = e + } + var err error + if d.gids { + err = d.task.UserNamespace().SetGIDMap(ctx, entries) + } else { + err = d.task.UserNamespace().SetUIDMap(ctx, entries) + } + if err != nil { + return 0, err + } + + // On success, Linux's kernel/user_namespace.c:map_write() always returns + // count, even if fewer bytes were used. + return int64(srclen), nil +} + +var _ kernfs.Inode = (*memInode)(nil) + +// memInode implements kernfs.Inode for /proc/[pid]/mem. +// +// +stateify savable +type memInode struct { + kernfs.InodeAttrs + kernfs.InodeNoStatFS + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + task *kernel.Task + locks vfs.FileLocks +} + +func (fs *filesystem) newMemInode(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { + // Note: credentials are overridden by taskOwnedInode. + inode := &memInode{task: task} + inode.init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) + return &taskOwnedInode{Inode: inode, owner: task} +} + +func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) +} + +// Open implements kernfs.Inode.Open. +func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS + // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS + // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH + if !kernel.ContextCanTrace(ctx, f.task, true) { + return nil, syserror.EACCES + } + if err := checkTaskState(f.task); err != nil { + return nil, err + } + fd := &memFD{} + if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// SetStat implements kernfs.Inode.SetStat. +func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return syserror.EPERM +} + +var _ vfs.FileDescriptionImpl = (*memFD)(nil) + +// memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. +// +// +stateify savable +type memFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + inode *memInode + + // mu guards the fields below. + mu sync.Mutex `state:"nosave"` + offset int64 +} + +// Init initializes memFD. +func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { + fd.LockFD.Init(&inode.locks) + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return err + } + fd.inode = inode + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + case linux.SEEK_CUR: + offset += fd.offset + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.offset = offset + return offset, nil +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + m, err := getMMIncRef(fd.inode.task) + if err != nil { + return 0, nil + } + defer m.DecUsers(ctx) + // Buffer the read data because of MM locks + buf := make([]byte, dst.NumBytes()) + n, readErr := m.CopyIn(ctx, usermem.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) + if n > 0 { + if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { + return 0, syserror.EFAULT + } + return int64(n), nil + } + if readErr != nil { + return 0, syserror.EIO + } + return 0, nil +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.offset, opts) + fd.offset += n + fd.mu.Unlock() + return n, err +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *memFD) Release(context.Context) {} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *memFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *memFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} + // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable @@ -482,7 +697,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss, data uint64 s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + fds = fdTable.CurrentMaxFDs() } if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() @@ -587,6 +802,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset // // +stateify savable type exeSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -596,29 +812,30 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode { inode := &exeSymlink{task: task} - inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d + inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + return inode } -// Readlink implements kernfs.Inode. -func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { - if !kernel.ContextCanTrace(ctx, s.task, false) { - return "", syserror.EACCES - } - - // Pull out the executable for /proc/[pid]/exe. - exec, err := s.executable() +// Readlink implements kernfs.Inode.Readlink. +func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { + exec, _, err := s.Getlink(ctx, nil) if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) + + root := vfs.RootFromContext(ctx) + if !root.Ok() { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) - return exec.PathnameWithDeleted(ctx), nil + vfsObj := exec.Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) + return name, nil } // Getlink implements kernfs.Inode.Getlink. @@ -626,23 +843,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", syserror.EACCES } - - exec, err := s.executable() - if err != nil { - return vfs.VirtualDentry{}, "", err - } - defer exec.DecRef() - - vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() - vd.IncRef() - return vd, "", nil -} - -func (s *exeSymlink) executable() (file fsbridge.File, err error) { if err := checkTaskState(s.task); err != nil { - return nil, err + return vfs.VirtualDentry{}, "", err } + var err error + var exec fsbridge.File s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { @@ -653,12 +859,75 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). - file = mm.Executable() - if file == nil { + exec = mm.Executable() + if exec == nil { err = syserror.ESRCH } }) - return + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef(ctx) + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + +// cwdSymlink is an symlink for the /proc/[pid]/cwd file. +// +// +stateify savable +type cwdSymlink struct { + implStatFS + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*cwdSymlink)(nil) + +func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode { + inode := &cwdSymlink{task: task} + inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) + return inode +} + +// Readlink implements kernfs.Inode.Readlink. +func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { + cwd, _, err := s.Getlink(ctx, nil) + if err != nil { + return "", err + } + defer cwd.DecRef(ctx) + + root := vfs.RootFromContext(ctx) + if !root.Ok() { + // It could have raced with process deletion. + return "", syserror.ESRCH + } + defer root.DecRef(ctx) + + vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) + return name, nil +} + +// Getlink implements kernfs.Inode.Getlink. +func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + cwd := s.task.FSContext().WorkingDirectoryVFS2() + if !cwd.Ok() { + // It could have raced with process deletion. + return vfs.VirtualDentry{}, "", syserror.ESRCH + } + return cwd, "", nil } // mountInfoData is used to implement /proc/[pid]/mountinfo. @@ -687,7 +956,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -718,18 +987,19 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } +// +stateify savable type namespaceSymlink struct { kernfs.StaticSymlink task *kernel.Task } -func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -737,61 +1007,68 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) + inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d + return taskInode } -// Readlink implements Inode. -func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { +// Readlink implements kernfs.Inode.Readlink. +func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if err := checkTaskState(s.task); err != nil { return "", err } - return s.StaticSymlink.Readlink(ctx) + return s.StaticSymlink.Readlink(ctx, mnt) } -// Getlink implements Inode.Getlink. +// Getlink implements kernfs.Inode.Getlink. func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } // Create a synthetic inode to represent the namespace. + fs := mnt.Filesystem().Impl().(*filesystem) + nsInode := &namespaceInode{} + nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) dentry := &kernfs.Dentry{} - dentry.Init(&namespaceInode{}) + dentry.Init(&fs.Filesystem, nsInode) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) - vd.IncRef() - dentry.DecRef() + // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. + mnt.IncRef() return vd, "", nil } // namespaceInode is a synthetic inode created to represent a namespace in // /proc/[pid]/ns/*. +// +// +stateify savable type namespaceInode struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink + + locks vfs.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) // Init initializes a namespace inode. -func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { +func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) + i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } -// Open implements Inode.Open. -func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +// Open implements kernfs.Inode.Open. +func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() - if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.LockFD.Init(&i.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -799,8 +1076,11 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * // namespace FD is a synthetic file that represents a namespace in // /proc/[pid]/ns/*. +// +// +stateify savable type namespaceFD struct { vfs.FileDescriptionDefaultImpl + vfs.LockFD vfsfd vfs.FileDescription inode *namespaceInode @@ -808,25 +1088,30 @@ type namespaceFD struct { var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) -// Stat implements FileDescriptionImpl. +// Stat implements vfs.FileDescriptionImpl.Stat. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(vfs, opts) + return fd.inode.Stat(ctx, vfs, opts) } -// SetStat implements FileDescriptionImpl. +// SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() creds := auth.CredentialsFromContext(ctx) return fd.inode.SetStat(ctx, vfs, creds, opts) } -// Release implements FileDescriptionImpl. -func (fd *namespaceFD) Release() { - fd.inode.DecRef() +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *namespaceFD) Release(ctx context.Context) { + fd.inode.DecRef(ctx) } -// OnClose implements FileDescriptionImpl. -func (*namespaceFD) OnClose(context.Context) error { - return nil +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6bde27376..5a9ee111f 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,12 +37,12 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { +func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) - var contents map[string]*kernfs.Dentry + var contents map[string]kernfs.Inode if stack := task.NetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" @@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. - contents = map[string]*kernfs.Dentry{ - "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), - "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), + contents = map[string]kernfs.Inode{ + "dev": fs.newInode(task, root, 0444, &netDevData{stack: stack}), + "snmp": fs.newInode(task, root, 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. - "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), - "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), - "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), - "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), - "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newInode(task, root, 0444, newStaticFile(arp)), + "netlink": fs.newInode(task, root, 0444, newStaticFile(netlink)), + "netstat": fs.newInode(task, root, 0444, &netStatData{}), + "packet": fs.newInode(task, root, 0444, newStaticFile(packet)), + "protocols": fs.newInode(task, root, 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), - "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), - "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newInode(task, root, 0444, newStaticFile(psched)), + "ptype": fs.newInode(task, root, 0444, newStaticFile(ptype)), + "route": fs.newInode(task, root, 0444, &netRouteData{stack: stack}), + "tcp": fs.newInode(task, root, 0444, &netTCPData{kernel: k}), + "udp": fs.newInode(task, root, 0444, &netUDPData{kernel: k}), + "unix": fs.newInode(task, root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) - contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newInode(task, root, 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newInode(task, root, 0444, newStaticFile("")) + contents["tcp6"] = fs.newInode(task, root, 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newInode(task, root, 0444, newStaticFile(upd6)) } } @@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } @@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil @@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } @@ -616,6 +616,7 @@ type netSnmpData struct { var _ dynamicInode = (*netSnmpData)(nil) +// +stateify savable type snmpLine struct { prefix string header string @@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string { return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []interface{}{ &inet.StatSNMPIP{}, @@ -709,7 +710,7 @@ type netRouteData struct { var _ dynamicInode = (*netRouteData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") @@ -773,7 +774,7 @@ type netStatData struct { var _ dynamicInode = (*netStatData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " + diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index b51d43954..b81ea14bf 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,19 +37,22 @@ const ( // // +stateify savable type tasksInode struct { - kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren + implStatFS + kernfs.InodeAlwaysValid kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren - kernfs.AlwaysValid + tasksInodeRefs + + locks vfs.FileLocks fs *filesystem pidns *kernel.PIDNamespace // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. - selfSymlink *vfs.Dentry - threadSelfSymlink *vfs.Dentry // cgroupControllers is a map of controller name to directory in the // cgroup hierarchy. These controllers are immutable and will be listed @@ -59,51 +62,53 @@ type tasksInode struct { var _ kernfs.Inode = (*tasksInode)(nil) -func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { +func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode { root := auth.NewRootCredentials(pidns.UserNamespace()) - contents := map[string]*kernfs.Dentry{ - "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), - "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}), - "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}), - "sys": fs.newSysDir(root, k), - "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), - "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}), - "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}), - "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}), + contents := map[string]kernfs.Inode{ + "cpuinfo": fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}), + "loadavg": fs.newInode(ctx, root, 0444, &loadavgData{}), + "sys": fs.newSysDir(ctx, root, k), + "meminfo": fs.newInode(ctx, root, 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), + "stat": fs.newInode(ctx, root, 0444, &statData{}), + "uptime": fs.newInode(ctx, root, 0444, &uptimeData{}), + "version": fs.newInode(ctx, root, 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, fs: fs, - selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), - threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } - inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) - - dentry := &kernfs.Dentry{} - dentry.Init(inode) + inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, contents) + links := inode.OrderedChildren.Populate(contents) inode.IncLinks(links) - return inode, dentry + return inode } -// Lookup implements kernfs.inodeDynamicLookup. -func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - // Try to lookup a corresponding task. +// Lookup implements kernfs.inodeDirectory.Lookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { + // Check if a static entry was looked up. + if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { + return d, nil + } + + // Not a static entry. Try to lookup a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) if err != nil { + root := auth.NewRootCredentials(i.pidns.UserNamespace()) // If it failed to parse, check if it's one of the special handled files. switch name { case selfName: - return i.selfSymlink, nil + return i.newSelfSymlink(ctx, root), nil case threadSelfName: - return i.threadSelfSymlink, nil + return i.newThreadSelfSymlink(ctx, root), nil } return nil, syserror.ENOENT } @@ -113,12 +118,11 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) - return taskDentry.VFSDentry(), nil + return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) } -// IterDirents implements kernfs.inodeDynamicLookup. -func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { +// IterDirents implements kernfs.inodeDirectory.IterDirents. +func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 @@ -195,17 +199,19 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback return maxTaskID, nil } -// Open implements kernfs.Inode. -func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) +// Open implements kernfs.Inode.Open. +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.InodeAttrs.Stat(vsfs, opts) +func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } @@ -222,9 +228,16 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Sta return stat, nil } +// DecRef implements kernfs.Inode.DecRef. +func (i *tasksInode) DecRef(ctx context.Context) { + i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type staticFileSetStat struct { dynamicBytesFileSetAttr vfs.StaticData diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 7d8983aa5..01b7a6678 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -31,7 +31,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type selfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -41,16 +43,13 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &selfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + inode := &selfSymlink{pidns: i.pidns} + inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } -func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -63,17 +62,19 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// +stateify savable type threadSelfSymlink struct { + implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink @@ -83,16 +84,13 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { - inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) - - d := &kernfs.Dentry{} - d.Init(inode) - return d +func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + inode := &threadSelfSymlink{pidns: i.pidns} + inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) + return inode } -func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { +func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? @@ -106,12 +104,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { - target, err := s.Readlink(ctx) +func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -119,16 +117,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden // dynamicBytesFileSetAttr implements a special file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. +// +// +stateify savable type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } -// SetStat implements Inode.SetStat. +// SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. +// +// +stateify savable type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. user uint64 diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 6dac2afa4..7c7afdcfa 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -25,94 +25,108 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable +type tcpMemDir int + +const ( + tcpRMem tcpMemDir = iota + tcpWMem +) + // newSysDir returns the dentry corresponding to /proc/sys directory. -func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { - return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), - "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), - "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), - "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), +func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { + return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), + "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), + "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), + "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), }), - "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), - "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), + "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), + "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), }), - "net": fs.newSysNetDir(root, k), + "net": fs.newSysNetDir(ctx, root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. -func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { - var contents map[string]*kernfs.Dentry +func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { + var contents map[string]kernfs.Inode // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { - contents = map[string]*kernfs.Dentry{ - "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), + contents = map[string]kernfs.Inode{ + "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), + "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), + "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), + "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), + "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). - "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")), - "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")), - "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "ip_local_port_range": fs.newInode(ctx, root, 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), + "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), + "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. - "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), - "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), + "tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")), + "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), + "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. - "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")), + "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")), + "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")), + "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), - "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), - "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), - "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), - "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")), + "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")), + "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")), + "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), + "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), + "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")), + "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), + "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), }), } } - return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) + return fs.newStaticDir(ctx, root, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for @@ -163,7 +177,7 @@ type tcpSackData struct { var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) -// Generate implements vfs.DynamicBytesSource. +// Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() @@ -180,10 +194,11 @@ func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Tough luck. val = "1\n" } - buf.WriteString(val) - return nil + _, err := buf.WriteString(val) + return err } +// Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. @@ -199,7 +214,7 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset var v int32 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) if err != nil { - return n, err + return 0, err } if d.enabled == nil { d.enabled = new(bool) @@ -207,3 +222,198 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset *d.enabled = v != 0 return n, d.stack.SetTCPSACKEnabled(*d.enabled) } + +// tcpRecoveryData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/tcp_recovery. +// +// +stateify savable +type tcpRecoveryData struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` +} + +var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { + recovery, err := d.stack.TCPRecovery() + if err != nil { + return err + } + + _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) + return err +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { + return 0, err + } + return n, nil +} + +// tcpMemData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. +// +// +stateify savable +type tcpMemData struct { + kernfs.DynamicBytesFile + + dir tcpMemDir + stack inet.Stack `state:"wait"` + + // mu protects against concurrent reads/writes to FDs based on the dentry + // backing this byte source. + mu sync.Mutex `state:"nosave"` +} + +var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { + d.mu.Lock() + defer d.mu.Unlock() + + size, err := d.readSizeLocked() + if err != nil { + return err + } + _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) + return err +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + d.mu.Lock() + defer d.mu.Unlock() + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + size, err := d.readSizeLocked() + if err != nil { + return 0, err + } + buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} + n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) + if err != nil { + return 0, err + } + newSize := inet.TCPBufferSize{ + Min: int(buf[0]), + Default: int(buf[1]), + Max: int(buf[2]), + } + if err := d.writeSizeLocked(newSize); err != nil { + return 0, err + } + return n, nil +} + +// Precondition: d.mu must be locked. +func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { + switch d.dir { + case tcpRMem: + return d.stack.TCPReceiveBufferSize() + case tcpWMem: + return d.stack.TCPSendBufferSize() + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) + } +} + +// Precondition: d.mu must be locked. +func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { + switch d.dir { + case tcpRMem: + return d.stack.SetTCPReceiveBufferSize(size) + case tcpWMem: + return d.stack.SetTCPSendBufferSize(size) + default: + panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) + } +} + +// ipForwarding implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/ip_forwarding. +// +// +stateify savable +type ipForwarding struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` + enabled *bool +} + +var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { + if ipf.enabled == nil { + enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber) + ipf.enabled = &enabled + } + + val := "0\n" + if *ipf.enabled { + // Technically, this is not quite compatible with Linux. Linux stores these + // as an integer, so if you write "2" into tcp_sack, you should get 2 back. + // Tough luck. + val = "1\n" + } + buf.WriteString(val) + + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + if ipf.enabled == nil { + ipf.enabled = new(bool) + } + *ipf.enabled = v != 0 + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { + return 0, err + } + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go index be54897bb..6cee22823 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -20,8 +20,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/usermem" ) func newIPv6TestStack() *inet.TestStack { @@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) { t.Errorf("Got n.contents() = %v, want = %v", got, want) } } + +// TestIPForwarding tests the implementation of +// /proc/sys/net/ipv4/ip_forwarding +func TestConfigureIPForwarding(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + + var cases = []struct { + comment string + initial bool + str string + final bool + }{ + { + comment: `Forwarding is disabled; write 1 and enable forwarding`, + initial: false, + str: "1", + final: true, + }, + { + comment: `Forwarding is disabled; write 0 and disable forwarding`, + initial: false, + str: "0", + final: false, + }, + { + comment: `Forwarding is enabled; write 1 and enable forwarding`, + initial: true, + str: "1", + final: true, + }, + { + comment: `Forwarding is enabled; write 0 and disable forwarding`, + initial: true, + str: "0", + final: false, + }, + { + comment: `Forwarding is disabled; write 2404 and enable forwarding`, + initial: false, + str: "2404", + final: true, + }, + { + comment: `Forwarding is enabled; write 2404 and enable forwarding`, + initial: true, + str: "2404", + final: true, + }, + } + for _, c := range cases { + t.Run(c.comment, func(t *testing.T) { + s.IPForwarding = c.initial + + file := &ipForwarding{stack: s, enabled: &c.initial} + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if got, want := s.IPForwarding, c.final; got != want { + t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 19abb5034..7ee6227a9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -67,6 +67,7 @@ var ( taskStaticFiles = map[string]testutil.DirentType{ "auxv": linux.DT_REG, "cgroup": linux.DT_REG, + "cwd": linux.DT_LNK, "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, @@ -76,6 +77,7 @@ var ( "gid_map": linux.DT_REG, "io": linux.DT_REG, "maps": linux.DT_REG, + "mem": linux.DT_REG, "mountinfo": linux.DT_REG, "mounts": linux.DT_REG, "net": linux.DT_DIR, @@ -104,13 +106,16 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } + root := mntns.Root() + root.IncRef() + defer root.DecRef(ctx) pop := &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { @@ -118,8 +123,8 @@ func setup(t *testing.T) *testutil.System { } pop = &vfs.PathOperation{ - Root: mntns.Root(), - Start: mntns.Root(), + Root: root, + Start: root, Path: fspath.Parse("/proc"), } mntOpts := &vfs.MountOptions{ @@ -132,7 +137,7 @@ func setup(t *testing.T) *testutil.System { }, }, } - if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) @@ -218,7 +223,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -336,7 +341,7 @@ func TestTasksOffset(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -441,7 +446,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } - defer child.DecRef() + defer child.DecRef(ctx) stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", absPath, err) @@ -476,7 +481,7 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("failed to create test file: %v", err) } - defer file.DecRef() + defer file.DecRef(s.Ctx) var tasks []*kernel.Task for i := 0; i < 5; i++ { @@ -501,5 +506,5 @@ func TestTree(t *testing.T) { t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) - fd.DecRef() + fd.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index 067c1657f..adb610213 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/kernel", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index d29ef3f83..10f1452ef 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -16,7 +16,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -26,11 +25,14 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// SignalFileDescription implements FileDescriptionImpl for signal fds. +// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds. +// +// +stateify savable type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // target is the original signal target task. // @@ -42,7 +44,7 @@ type SignalFileDescription struct { target *kernel.Task // mu protects mask. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // mask is the signal mask. Protected by mu. mask linux.SignalSet @@ -53,7 +55,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) // New creates a new signal fd. func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[signalfd]") - defer vd.DecRef() + defer vd.DecRef(target) sfd := &SignalFileDescription{ target: target, mask: mask, @@ -82,7 +84,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { sfd.mask = mask } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { // Attempt to dequeue relevant signals. info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) @@ -92,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -102,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. + err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. @@ -131,5 +136,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { sfd.target.SignalUnregister(entry) } -// Release implements FileDescriptionImpl.Release() -func (sfd *SignalFileDescription) Release() {} +// Release implements vfs.FileDescriptionImpl.Release. +func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index ee0828a15..fda1fa942 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -28,14 +28,16 @@ import ( ) // filesystemType implements vfs.FilesystemType. +// +// +stateify savable type filesystemType struct{} -// GetFilesystem implements FilesystemType.GetFilesystem. +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { panic("sockfs.filesystemType.GetFilesystem should never be called") } -// Name implements FilesystemType.Name. +// Name implements vfs.FilesystemType.Name. // // Note that registering sockfs is unnecessary, except for the fact that it // will not show up under /proc/filesystems as a result. This is a very minor @@ -44,6 +46,10 @@ func (filesystemType) Name() string { return "sockfs" } +// Release implements vfs.FilesystemType.Release. +func (filesystemType) Release(ctx context.Context) {} + +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -67,9 +73,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. @@ -80,30 +86,37 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } // inode implements kernfs.Inode. +// +// +stateify savable type inode struct { - kernfs.InodeNotDirectory - kernfs.InodeNotSymlink kernfs.InodeAttrs kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink } // Open implements kernfs.Inode.Open. -func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } +// StatFS implements kernfs.Inode.StatFS. +func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil +} + // NewDentry constructs and returns a sockfs dentry. // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). -func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { +func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry { fs := mnt.Filesystem().Impl().(*filesystem) // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) i := &inode{} - i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) + i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} - d.Init(i) + d.Init(&fs.Filesystem, i) return d.VFSDentry() } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index a741e2bb6..09043b572 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -1,21 +1,42 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "sys", + prefix = "dir", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "dir", + }, +) + go_library( name = "sys", srcs = [ + "dir_refs.go", + "kcov.go", "sys.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/coverage", + "//pkg/log", + "//pkg/refs", + "//pkg/refsvfs2", + "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -29,6 +50,6 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go new file mode 100644 index 000000000..b13f141a8 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -0,0 +1,118 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sys + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + k := &kcovInode{} + k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600) + return k +} + +// kcovInode implements kernfs.Inode. +// +// +stateify savable +type kcovInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + implStatFS +} + +func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + panic("KernelFromContext returned nil") + } + fd := &kcovFD{ + inode: i, + kcov: k.NewKcov(), + } + + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// +stateify savable +type kcovFD struct { + vfs.FileDescriptionDefaultImpl + vfs.NoLockFD + + vfsfd vfs.FileDescription + inode *kcovInode + kcov *kernel.Kcov +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + cmd := uint32(args[1].Int()) + arg := args[2].Uint64() + switch uint32(cmd) { + case linux.KCOV_INIT_TRACE: + return 0, fd.kcov.InitTrace(arg) + case linux.KCOV_ENABLE: + return 0, fd.kcov.EnableTrace(ctx, uint8(arg)) + case linux.KCOV_DISABLE: + if arg != 0 { + // This arg is unused; it should be 0. + return 0, syserror.EINVAL + } + return 0, fd.kcov.DisableTrace(ctx) + default: + return 0, syserror.ENOTTY + } +} + +// ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap. +func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.kcov.ConfigureMMap(ctx, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *kcovFD) Release(ctx context.Context) { + // kcov instances have reference counts in Linux, but this seems sufficient + // for our purposes. + fd.kcov.Clear(ctx) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts) +} diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 0af373604..7d2147141 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -18,9 +18,11 @@ package sys import ( "bytes" "fmt" + "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -28,13 +30,21 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// Name is the default filesystem name. -const Name = "sysfs" +const ( + // Name is the default filesystem name. + Name = "sysfs" + defaultSysDirMode = linux.FileMode(0755) + defaultMaxCachedDentries = uint64(1000) +) // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { kernfs.Filesystem @@ -46,6 +56,9 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() @@ -53,97 +66,157 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, err } + mopts := vfs.GenericParseMountOptions(opts.Data) + maxCachedDentries := defaultMaxCachedDentries + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err = strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + } + fs := &filesystem{ devMinor: devMinor, } + fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - k := kernel.KernelFromContext(ctx) - maxCPUCores := k.ApplicationCores() - defaultSysDirMode := linux.FileMode(0755) - root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "block": fs.newDir(creds, defaultSysDirMode, nil), - "bus": fs.newDir(creds, defaultSysDirMode, nil), - "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "power_supply": fs.newDir(creds, defaultSysDirMode, nil), + root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ + "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "bus": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "class": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ + "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil), }), - "dev": fs.newDir(creds, defaultSysDirMode, nil), - "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - }), + "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "devices": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ + "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ + "cpu": cpuDir(ctx, fs, creds), }), }), - "firmware": fs.newDir(creds, defaultSysDirMode, nil), - "fs": fs.newDir(creds, defaultSysDirMode, nil), - "kernel": fs.newDir(creds, defaultSysDirMode, nil), - "module": fs.newDir(creds, defaultSysDirMode, nil), - "power": fs.newDir(creds, defaultSysDirMode, nil), + "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "fs": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "kernel": kernelDir(ctx, fs, creds), + "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), + "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), }) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var rootD kernfs.Dentry + rootD.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), rootD.VFSDentry(), nil +} + +func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { + k := kernel.KernelFromContext(ctx) + maxCPUCores := k.ApplicationCores() + children := map[string]kernfs.Inode{ + "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), + "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), + "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), + } + for i := uint(0); i < maxCPUCores; i++ { + children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil) + } + return fs.newDir(ctx, creds, defaultSysDirMode, children) +} + +func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { + // If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs + // should be mounted at debug/, but for our purposes, it is sufficient to + // keep it in sys. + var children map[string]kernfs.Inode + if coverage.KcovAvailable() { + children = map[string]kernfs.Inode{ + "debug": fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{ + "kcov": fs.newKcovFile(ctx, creds), + }), + } + } + return fs.newDir(ctx, creds, defaultSysDirMode, children) } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dir implements kernfs.Inode. +// +// +stateify savable type dir struct { + dirRefs + kernfs.InodeAlwaysValid kernfs.InodeAttrs - kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren - + kernfs.InodeTemporary kernfs.OrderedChildren - dentry kernfs.Dentry + + locks vfs.FileLocks } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { d := &dir{} - d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) + d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - d.dentry.Init(d) - - d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) - - return &d.dentry + d.EnableLeakCheck() + d.IncLinks(d.OrderedChildren.Populate(contents)) + return d } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } // Open implements kernfs.Inode.Open. -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +} + +// StatFS implements kernfs.Inode.StatFS. +func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil +} + // cpuFile implements kernfs.Inode. +// +// +stateify savable type cpuFile struct { + implStatFS kernfs.DynamicBytesFile + maxCores uint } // Generate implements vfs.DynamicBytesSource.Generate. func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "0-%d", c.maxCores-1) + fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) return nil } -func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { c := &cpuFile{maxCores: maxCores} - c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) - d := &kernfs.Dentry{} - d.Init(c) - return d + c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) + return c +} + +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 4b3602d47..0a0d914cc 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System { AllowUserMount: true, }) - mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } @@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) maxCPUCores := k.ApplicationCores() - expected := fmt.Sprintf("0-%d", maxCPUCores-1) + expected := fmt.Sprintf("0-%d\n", maxCPUCores-1) for _, fname := range []string{"online", "possible", "present"} { pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) @@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) content, err := s.ReadToEnd(fd) if err != nil { t.Fatalf("Read failed: %v", err) diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index 0e4053a46..400a97996 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -32,6 +32,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/usermem", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index c16a36cdb..738c0c9cc 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("creating platform: %v", err) } + kernel.VFS2Enabled = true k := &kernel.Kernel{ Platform: plat, } @@ -73,7 +74,7 @@ func Boot() (*kernel.Kernel, error) { k.SetMemoryFile(mf) // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } @@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } - kernel.VFS2Enabled = true - - if err := k.VFS().Init(); err != nil { - return nil, fmt.Errorf("VFS init: %v", err) - } k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, @@ -126,12 +122,16 @@ func Boot() (*kernel.Kernel, error) { // CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) if err != nil { return nil, err } m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - m.SetExecutable(fsbridge.NewVFSFile(exe)) + m.SetExecutable(ctx, fsbridge.NewVFSFile(exe)) config := &kernel.TaskConfig{ Kernel: k, @@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), FDTable: k.NewFDTable(), } - return k.TaskSet().NewTask(config) + t, err := k.TaskSet().NewTask(ctx, config) + if err != nil { + config.ThreadGroup.Release(ctx) + return nil, err + } + return t, nil } func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 0556af877..1a8525b06 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -46,16 +46,18 @@ type System struct { // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on MntNs, whose ownership +// Precondition: Caller must hold a reference on mns, whose ownership // is transferred to the new System. func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { + root := mns.Root() + root.IncRef() s := &System{ t: t, Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, MntNs: mns, - Root: mns.Root(), + Root: root, } return s } @@ -97,8 +99,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { - s.Root.DecRef() - s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. + s.Root.DecRef(s.Ctx) + s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. @@ -149,7 +151,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { if err != nil { s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) collector := &DirentCollector{} if err := fd.IterDirents(s.Ctx, collector); err != nil { @@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error { defer d.mu.Unlock() dirent, ok := d.dirents[name] if !ok { - return fmt.Errorf("No dirent named %q found", name) + return fmt.Errorf("no dirent named %q found", name) } if dirent.Type != typ { - return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) + return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) } return nil } diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 60c92d626..8853c8ad2 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -26,12 +26,15 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also // implements ktime.TimerListener. +// +// +stateify savable type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD events waiter.Queue timer *ktime.Timer @@ -46,9 +49,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.TimerListener = (*TimerFileDescription)(nil) // New returns a new timer fd. -func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ @@ -61,7 +64,7 @@ func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.F return &tfd.vfsfd, nil } -// Read implements FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { const sizeofUint64 = 8 if dst.NumBytes() < sizeofUint64 { @@ -127,8 +130,8 @@ func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } -// Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { +// Release implements vfs.FileDescriptionImpl.Release. +func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 062321cbc..fe520b6fd 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -26,6 +26,17 @@ go_template_instance( }, ) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "tmpfs", + prefix = "inode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "inode", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -34,8 +45,10 @@ go_library( "directory.go", "filesystem.go", "fstree.go", + "inode_refs.go", "named_pipe.go", "regular_file.go", + "save_restore.go", "socket_file.go", "symlink.go", "tmpfs.go", @@ -47,6 +60,8 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", @@ -62,7 +77,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 2fb5c4d84..3cc63e732 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent } err = fn(root, d) - d.DecRef() + d.DecRef(ctx) return err } @@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) d := root d.IncRef() - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -176,24 +176,25 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) vd := root vd.IncRef() for i := depth; i > 0; i-- { @@ -212,7 +213,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -228,12 +229,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -278,14 +279,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create and mount the submount. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { b.Fatalf("failed to create mount point: %v", err) } @@ -293,7 +294,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) if err != nil { b.Fatalf("failed to create tmpfs submount: %v", err) @@ -309,7 +310,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -319,7 +320,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -330,7 +331,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -370,24 +371,25 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create the mount point. root := mntns.Root() - defer root.DecRef() + root.IncRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -403,9 +405,9 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create and mount the submount. - if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) @@ -432,7 +434,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -448,11 +450,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - fd.DecRef() + fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index 83bf885ee..9129d35b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) +// +stateify savable type deviceFile struct { inode inode kind vfs.DeviceKind @@ -29,7 +30,7 @@ type deviceFile struct { minor uint32 } -func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { +func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { file := &deviceFile{ kind: kind, major: major, @@ -43,7 +44,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode default: panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) } - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, kuid, kgid, mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 70387cb9c..e90669cf0 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory @@ -44,21 +45,22 @@ type directory struct { // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. - iterMu sync.Mutex + iterMu sync.Mutex `state:"nosave"` childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory { +func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory { dir := &directory{} - dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) + dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root dir.dentry.inode = &dir.inode dir.dentry.vfsd.Init(&dir.dentry) return dir } -// Preconditions: filesystem.mu must be locked for writing. dir must not -// already contain a child with the given name. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * dir must not already contain a child with the given name. func (dir *directory) insertChildLocked(child *dentry, name string) { child.parent = &dir.dentry child.name = name @@ -79,9 +81,13 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() - child.unlinked = true } +func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) +} + +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl @@ -92,7 +98,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() @@ -107,13 +113,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs := fd.filesystem() dir := fd.inode().impl.(*directory) + defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) + // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() defer fs.mu.RUnlock() dir.iterMu.Lock() defer dir.iterMu.Unlock() - fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 183eb975c..e39cd305b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -39,8 +38,10 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { return nil, syserror.ENOTDIR @@ -55,13 +56,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -74,12 +75,12 @@ afterSymlink: if !ok { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // Symlink traversal updates access time. - atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds()) + child.inode.touchAtime(rp.Mount()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -97,10 +98,12 @@ afterSymlink: // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). +func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -118,10 +121,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. -func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { +func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -139,12 +142,13 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -182,7 +186,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) + parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } @@ -191,7 +195,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return err } @@ -202,7 +206,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -222,7 +226,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } @@ -232,35 +236,40 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } d := vd.Dentry().Impl().(*dentry) - if d.inode.isDir() { + i := d.inode + if i.isDir() { return syserror.EPERM } - if d.inode.nlink == 0 { + if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } + if i.nlink == 0 { return syserror.ENOENT } - if d.inode.nlink == maxLinks { + if i.nlink == maxLinks { return syserror.EMLINK } - d.inode.incLinksLocked() - d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) - parentDir.insertChildLocked(fs.newDentry(d.inode), name) + i.incLinksLocked() + i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) + parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { return syserror.EMLINK } parentDir.inode.incLinksLocked() // from child's ".." - childDir := fs.newDirectory(rp.Credentials(), opts.Mode) + childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) parentDir.insertChildLocked(&childDir.dentry, name) return nil }) @@ -268,19 +277,20 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { - case 0, linux.S_IFREG: - childInode = fs.newRegularFile(rp.Credentials(), opts.Mode) + case linux.S_IFREG: + childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) case linux.S_IFIFO: - childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode) + childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) case linux.S_IFBLK: - childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFCHR: - childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFSOCK: - childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint) + childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint) default: return syserror.EINVAL } @@ -301,30 +311,43 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // don't need fs.mu for writing. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { + fs.mu.RUnlock() return nil, err } + d.IncRef() + defer d.DecRef(ctx) + fs.mu.RUnlock() return d.open(ctx, rp, &opts, false /* afterCreate */) } mustCreate := opts.Flags&linux.O_EXCL != 0 start := rp.Start().Impl().(*dentry) fs.mu.Lock() - defer fs.mu.Unlock() + unlocked := false + unlock := func() { + if !unlocked { + fs.mu.Unlock() + unlocked = true + } + } + defer unlock() if rp.Done() { - // Reject attempts to open directories with O_CREAT. + // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { return nil, syserror.EISDIR } if mustCreate { return nil, syserror.EEXIST } + start.IncRef() + defer start.DecRef(ctx) + unlock() return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parentDir, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } @@ -355,37 +378,46 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + creds := rp.Credentials() + child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) parentDir.insertChildLocked(child, name) + unlock() fd, err := child.open(ctx, rp, &opts, true) if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) + parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } + if mustCreate { + return nil, syserror.EEXIST + } // Is the file mounted over? - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // Symlink traversal updates access time. - atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds()) + child.inode.touchAtime(rp.Mount()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } start = &parentDir.dentry goto afterTrailingSymlink } - // Open existing file. - if mustCreate { - return nil, syserror.EEXIST + if rp.MustBeDir() && !child.inode.isDir() { + return nil, syserror.ENOTDIR } + child.IncRef() + defer child.DecRef(ctx) + unlock() return child.open(ctx, rp, &opts, false) } +// Preconditions: The caller must hold no locks (since opening pipes may block +// indefinitely). func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { @@ -396,10 +428,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.LockFD.Init(&d.inode.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } - if opts.Flags&linux.O_TRUNC != 0 { + if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { if _, err := impl.truncate(0); err != nil { return nil, err } @@ -411,15 +444,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, syserror.EISDIR } var fd directoryFD - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.LockFD.Init(&d.inode.locks) + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - // Can't open symlinks without O_PATH (which is unimplemented). + // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH. return nil, syserror.ELOOP case *namedPipe: - return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: @@ -433,7 +467,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -455,7 +489,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -480,6 +514,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !ok { return syserror.ENOENT } + if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { + return err + } // Note that we don't need to call rp.CheckMount(), since if renamed is a // mount point then we want to rename the mount point, not anything in the // mounted filesystem. @@ -540,7 +577,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd @@ -551,17 +588,19 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParentDir.inode.decLinksLocked() // from replaced's ".." + // Remove links for replaced/. and replaced/.. + replaced.inode.decLinksLocked(ctx) + newParentDir.inode.decLinksLocked(ctx) } - replaced.inode.decLinksLocked() + replaced.inode.decLinksLocked(ctx) } oldParentDir.removeChildLocked(renamed) newParentDir.insertChildLocked(renamed, newName) - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) oldParentDir.inode.touchCMtime() if oldParentDir != newParentDir { if renamed.inode.isDir() { - oldParentDir.inode.decLinksLocked() + oldParentDir.inode.decLinksLocked(ctx) newParentDir.inode.incLinksLocked() } newParentDir.inode.touchCMtime() @@ -576,7 +615,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -594,6 +633,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if !ok { return syserror.ENOENT } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } childDir, ok := child.inode.impl.(*directory) if !ok { return syserror.ENOTDIR @@ -608,17 +650,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) + parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. - child.inode.decLinksLocked() - child.inode.decLinksLocked() - parentDir.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + child.inode.decLinksLocked(ctx) + parentDir.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -626,17 +668,19 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { + fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + err = d.inode.setStat(ctx, rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -645,7 +689,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return linux.Statx{}, err } @@ -658,25 +702,17 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - if _, err := resolveLocked(rp); err != nil { + if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } - statfs := linux.Statfs{ - Type: linux.TMPFS_MAGIC, - BlockSize: usermem.PageSize, - FragmentSize: usermem.PageSize, - NameLength: linux.NAME_MAX, - // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. - Blocks: 0, - BlocksFree: 0, - } - return statfs, nil + return globalStatfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { - child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() + child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) parentDir.insertChildLocked(child, name) return nil }) @@ -686,7 +722,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -701,6 +737,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if !ok { return syserror.ENOENT } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } if child.inode.isDir() { return syserror.EISDIR } @@ -714,7 +753,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -722,20 +761,20 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // Generate inotify events. Note that this must take place before the link // count of the child is decremented, or else the watches may be dropped // before these events are added. - vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) parentDir.removeChildLocked(child) - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -744,63 +783,70 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } switch impl := d.inode.impl.(type) { case *socketFile: + if impl.ep == nil { + return nil, syserror.ECONNREFUSED + } return impl.ep, nil default: return nil, syserror.ECONNREFUSED } } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } - return d.inode.listxattr(size) + return d.inode.listXattr(size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } - return d.inode.getxattr(rp.Credentials(), &opts) + return d.inode.getXattr(rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { + fs.mu.RUnlock() return err } - if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + err = d.inode.setXattr(rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { + fs.mu.RUnlock() return err } - if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + err = d.inode.removeXattr(rp.Credentials(), name) + fs.mu.RUnlock() + if err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -819,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } if d.parent == nil { if d.name != "" { - // This must be an anonymous memfd file. + // This file must have been created by + // newUnlinkedRegularFileDescription(). In Linux, + // mm/shmem.c:__shmem_file_setup() => + // fs/file_table.c:alloc_file_pseudo() sets the created + // dentry's dentry_operations to anon_ops, for which d_dname == + // simple_dname. fs/d_path.c:simple_dname() defines the + // dentry's pathname to be its name, prefixed with "/" and + // suffixed with " (deleted)". b.PrependComponent("/" + d.name) + b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 8d77b3fa8..d772db9e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type namedPipe struct { inode inode @@ -28,11 +29,11 @@ type namedPipe struct { } // Preconditions: -// * fs.mu must be locked. -// * rp.Mount().CheckBeginWrite() has been called successfully. -func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. +func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} - file.inode.init(file, fs, creds, linux.S_IFIFO|mode) + file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 1614f2c39..2f856ce36 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -32,7 +32,7 @@ const fileName = "mypipe" func TestSeparateFDs(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side. This is done in a concurrently because opening // One end the pipe blocks until the other end is opened. @@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) rfd, ok := <-rfdchan if !ok { t.Fatalf("failed to open pipe for reading %q", fileName) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) const msg = "vamos azul" checkEmpty(ctx, t, rfd) @@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) { func TestNonblockingRead(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side as nonblocking. pop := vfs.PathOperation{ @@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) // Open the write side. openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} @@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) const msg = "geh blau" checkEmpty(ctx, t, rfd) @@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) { func TestNonblockingWriteError(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the write side as nonblocking, which should return ENXIO. pop := vfs.PathOperation{ @@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) { func TestSingleFD(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the pipe as readable and writable. pop := vfs.PathOperation{ @@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) const msg = "forza blu" checkEmpty(ctx, t, fd) @@ -152,19 +152,20 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } // Create the pipe. root := mntns.Root() + root.IncRef() pop := vfs.PathOperation{ Root: root, Start: root, diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index fee174375..98680fde9 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -37,11 +36,17 @@ import ( ) // regularFile is a regular (=S_IFREG) tmpfs file. +// +// +stateify savable type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. - memFile *pgalloc.MemoryFile + memFile *pgalloc.MemoryFile `state:"nosave"` + + // memoryUsageKind is the memory accounting category under which pages backing + // this regularFile's contents are accounted. + memoryUsageKind usage.MemoryKind // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` @@ -63,7 +68,7 @@ type regularFile struct { writableMappingPages uint64 // dataMu protects the fields below. - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. @@ -85,16 +90,77 @@ type regularFile struct { size uint64 } -func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ - memFile: fs.memFile, - seals: linux.F_SEAL_SEAL, + memFile: fs.mfp.MemoryFile(), + memoryUsageKind: usage.Tmpfs, + seals: linux.F_SEAL_SEAL, } - file.inode.init(file, fs, creds, linux.S_IFREG|mode) + file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } +// newUnlinkedRegularFileDescription creates a regular file on the tmpfs +// filesystem represented by mount and returns an FD representing that file. +// The new file is not reachable by path traversal from any other file. +// +// newUnlinkedRegularFileDescription is analogous to Linux's +// mm/shmem.c:__shmem_file_setup(). +// +// Preconditions: mount must be a tmpfs mount. +func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") + } + + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + d := fs.newDentry(inode) + defer d.DecRef(ctx) + d.name = name + + fd := ®ularFileFD{} + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// NewZeroFile creates a new regular file and file description as for +// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is +// initially (implicitly) filled with zeroes. +// +// Preconditions: mount must be a tmpfs mount. +func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { + // Compare mm/shmem.c:shmem_zero_setup(). + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") + if err != nil { + return nil, err + } + rf := fd.inode().impl.(*regularFile) + rf.memoryUsageKind = usage.Anonymous + rf.size = size + return &fd.vfsfd, err +} + +// NewMemfd creates a new regular file and file description as for +// memfd_create. +// +// Preconditions: mount must be a tmpfs mount. +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) + if err != nil { + return nil, err + } + if allowSeals { + fd.inode().impl.(*regularFile).seals = 0 + } + return &fd.vfsfd, nil +} + // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { @@ -227,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -261,25 +327,50 @@ func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { // noop } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + f := fd.inode().impl.(*regularFile) + + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + oldSize := f.size + size := offset + length + if oldSize >= size { + return nil + } + _, err := f.truncateLocked(size) + return err +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } + + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, syserror.EOPNOTSUPP + } + if dst.NumBytes() == 0 { return 0, nil } @@ -302,40 +393,60 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL + } + + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, offset, syserror.EOPNOTSUPP } + srclen := src.NumBytes() if srclen == 0 { - return 0, nil + return 0, offset, nil } f := fd.inode().impl.(*regularFile) + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + // If the file is opened with O_APPEND, update offset to file size. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking f.inode.mu is sufficient for reading f.size. + offset = int64(f.size) + } if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } - var err error srclen, err = vfs.CheckLimit(ctx, offset, srclen) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(srclen) - f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) - fd.inode().touchCMtimeLocked() - f.inode.mu.Unlock() + f.inode.touchCMtimeLocked() putRegularFileReadWriter(rw) - return n, err + return n, n + offset, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.offMu.Unlock() return n, err } @@ -361,33 +472,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( return offset, nil } -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *regularFileFD) Sync(ctx context.Context) error { - return nil -} - -// LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return fd.inode().lockBSD(uid, t, block) -} - -// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. -func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - fd.inode().unlockBSD(uid) - return nil -} - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return fd.inode().lockPOSIX(uid, t, rng, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - fd.inode().unlockPOSIX(uid, rng) - return nil -} - // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) @@ -559,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 64e1c40ad..146c7fdfe 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -138,48 +138,37 @@ func TestLocks(t *testing.T) { } defer cleanup() - var ( - uid1 lock.UniqueID - uid2 lock.UniqueID - // Non-blocking. - block lock.Blocker - ) - - uid1 = 123 - uid2 = 456 - - if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + uid1 := 123 + uid2 := 456 + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - rng1 := lock.LockRange{0, 1} - rng2 := lock.LockRange{1, 2} - - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } - if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/save_restore.go b/pkg/sentry/fsimpl/tmpfs/save_restore.go new file mode 100644 index 000000000..b27f75cc2 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/save_restore.go @@ -0,0 +1,20 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +// afterLoad is called by stateify. +func (rf *regularFile) afterLoad() { + rf.memFile = rf.inode.fs.mfp.MemoryFile() +} diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go index 25c2321af..5699d5975 100644 --- a/pkg/sentry/fsimpl/tmpfs/socket_file.go +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -21,14 +21,16 @@ import ( ) // socketFile is a socket (=S_IFSOCK) tmpfs file. +// +// +stateify savable type socketFile struct { inode inode ep transport.BoundEndpoint } -func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode { +func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode { file := &socketFile{ep: ep} - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, kuid, kgid, mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index 47e075ed4..a102a2ee2 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -19,16 +19,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) +// +stateify savable type symlink struct { inode inode target string // immutable } -func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode { +func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode { link := &symlink{ target: target, } - link.inode.init(link, fs, creds, linux.S_IFLNK|0777) + link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode) link.inode.nlink = 1 // from parent directory return &link.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index f0e098702..4ce859d57 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -30,6 +30,7 @@ package tmpfs import ( "fmt" "math" + "strconv" "strings" "sync/atomic" @@ -40,7 +41,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -51,14 +51,19 @@ import ( const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem - // memFile is used to allocate pages to for regular files. - memFile *pgalloc.MemoryFile + // mfp is used to allocate memory that stores regular file contents. mfp is + // immutable. + mfp pgalloc.MemoryFileProvider // clock is a realtime clock used to set timestamps in file operations. clock time.Clock @@ -67,9 +72,11 @@ type filesystem struct { devMinor uint32 // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` nextInoMinusOne uint64 // accessed using atomic memory operations + + root *dentry } // Name implements vfs.FilesystemType.Name. @@ -77,7 +84,12 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // FilesystemOpts is used to pass configuration data to tmpfs. +// +// +stateify savable type FilesystemOpts struct { // RootFileType is the FileType of the filesystem root. Valid values // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. @@ -95,8 +107,8 @@ type FilesystemOpts struct { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) - if memFileProvider == nil { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { panic("MemoryFileProviderFromContext returned nil") } @@ -112,13 +124,65 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + mopts := vfs.GenericParseMountOptions(opts.Data) + rootMode := linux.FileMode(0777) + if rootFileType == linux.S_IFDIR { + rootMode = 01777 + } + modeStr, ok := mopts["mode"] + if ok { + delete(mopts, "mode") + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) + return nil, nil, syserror.EINVAL + } + rootMode = linux.FileMode(mode & 07777) + } + rootKUID := creds.EffectiveKUID + uidStr, ok := mopts["uid"] + if ok { + delete(mopts, "uid") + uid, err := strconv.ParseUint(uidStr, 10, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) + return nil, nil, syserror.EINVAL + } + kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) + if !kuid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) + return nil, nil, syserror.EINVAL + } + rootKUID = kuid + } + rootKGID := creds.EffectiveKGID + gidStr, ok := mopts["gid"] + if ok { + delete(mopts, "gid") + gid, err := strconv.ParseUint(gidStr, 10, 32) + if err != nil { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) + return nil, nil, syserror.EINVAL + } + kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) + if !kgid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) + return nil, nil, syserror.EINVAL + } + rootKGID = kgid + } + if len(mopts) != 0 { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ - memFile: memFileProvider.MemoryFile(), + mfp: mfp, clock: clock, devMinor: devMinor, } @@ -127,15 +191,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newDentry(fs.newRegularFile(creds, 0777)) + root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode)) case linux.S_IFLNK: - root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) + root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = &fs.newDirectory(creds, 01777).dentry + root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry default: - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } + fs.root = root return &fs.vfsfs, &root.vfsd, nil } @@ -145,11 +210,63 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.mu.Lock() + if fs.root.inode.isDir() { + fs.root.releaseChildrenLocked(ctx) + } + fs.mu.Unlock() +} + +// releaseChildrenLocked is called on the mount point by filesystem.Release() to +// destroy all objects in the mount. It performs a depth-first walk of the +// filesystem and "unlinks" everything by decrementing link counts +// appropriately. There should be no open file descriptors when this is called, +// so each inode should only have one outstanding reference that is removed once +// its link count hits zero. +// +// Note that we do not update filesystem state precisely while tearing down (for +// instance, the child maps are ignored)--we only care to remove all remaining +// references so that every filesystem object gets destroyed. Also note that we +// do not need to trigger DecRef on the mount point itself or any child mount; +// these are taken care of by the destructor of the enclosing MountNamespace. +// +// Precondition: filesystem.mu is held. +func (d *dentry) releaseChildrenLocked(ctx context.Context) { + dir := d.inode.impl.(*directory) + for _, child := range dir.childMap { + if child.inode.isDir() { + child.releaseChildrenLocked(ctx) + child.inode.decLinksLocked(ctx) // link for child/. + dir.inode.decLinksLocked(ctx) // link for child/.. + } + child.inode.decLinksLocked(ctx) // link for child + } +} + +// immutable +var globalStatfs = linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + + // tmpfs currently does not support configurable size limits. In Linux, + // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from + // statfs(2). However, many applications treat this as having a size limit + // of 0. To work around this, claim to have a very large but non-zero size, + // chosen to ensure that BlockSize * Blocks does not overflow int64 (which + // applications may also handle incorrectly). + // TODO(b/29637826): allow configuring a tmpfs size and enforce it. + Blocks: math.MaxInt64 / usermem.PageSize, + BlocksFree: math.MaxInt64 / usermem.PageSize, + BlocksAvailable: math.MaxInt64 / usermem.PageSize, } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -163,11 +280,6 @@ type dentry struct { // filesystem.mu. name string - // unlinked indicates whether this dentry has been unlinked from its parent. - // It is only set to true on an unlink operation, and never set from true to - // false. unlinked is protected by filesystem.mu. - unlinked bool - // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry @@ -202,23 +314,27 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - d.inode.decRef() +func (d *dentry) DecRef(ctx context.Context) { + d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } + // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + deleted := d.vfsd.IsDead() + + d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - // Note that d.parent or d.name may be stale if there is a concurrent - // rename operation. Inotify does not provide consistency guarantees. - d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked) + d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) } - d.inode.watches.Notify("", events, cookie, et) + d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) + d.inode.fs.mu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. @@ -226,18 +342,20 @@ func (d *dentry) Watches() *vfs.Watches { return &d.inode.watches } +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +func (d *dentry) OnZeroWatches(context.Context) {} + // inode represents a filesystem object. +// +// +stateify savable type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem - // refs is a reference count. refs is accessed using atomic memory - // operations. - // // A reference is held on all inodes as long as they are reachable in the // filesystem tree, i.e. nlink is nonzero. This reference is dropped when // nlink reaches 0. - refs int64 + refs inodeRefs // xattrs implements extended attributes. // @@ -246,20 +364,19 @@ type inode struct { // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. - mu sync.Mutex - mode uint32 // file type and mode - nlink uint32 // protected by filesystem.mu instead of inode.mu - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - ino uint64 // immutable + mu sync.Mutex `state:"nosave"` + mode uint32 // file type and mode + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds ctime int64 // nanoseconds mtime int64 // nanoseconds - // Advisory file locks, which lock at the inode level. - locks lock.FileLocks + locks vfs.FileLocks // Inotify watches for this inode. watches vfs.Watches @@ -269,15 +386,14 @@ type inode struct { const maxLinks = math.MaxUint32 -func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { +func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) { if mode.FileType() == 0 { panic("file type is required in FileMode") } i.fs = fs - i.refs = 1 i.mode = uint32(mode) - i.uid = uint32(creds.EffectiveKUID) - i.gid = uint32(creds.EffectiveKGID) + i.uid = uint32(kuid) + i.gid = uint32(kgid) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. now := fs.clock.Now().Nanoseconds() @@ -285,14 +401,16 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.ctime = now i.mtime = now // i.nlink initialized by caller - i.watches = vfs.Watches{} i.impl = impl + i.refs.EnableLeakCheck() } // incLinksLocked increments i's link count. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -// i.nlink < maxLinks. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. +// * i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") @@ -306,46 +424,36 @@ func (i *inode) incLinksLocked() { // decLinksLocked decrements i's link count. If the link count reaches 0, we // remove a reference on i as well. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -func (i *inode) decLinksLocked() { +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. +func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { - i.decRef() + i.decRef(ctx) } } func (i *inode) incRef() { - if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("tmpfs.inode.incRef() called without holding a reference") - } + i.refs.IncRef() } func (i *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&i.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { - return true - } - } + return i.refs.TryIncRef() } -func (i *inode) decRef() { - if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - i.watches.HandleDeletion() +func (i *inode) decRef(ctx context.Context) { + i.refs.DecRef(func() { + i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any // metadata. regFile.data.DropAll(regFile.memFile) } - } else if refs < 0 { - panic("tmpfs.inode.decRef() called without holding a reference") - } + }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { @@ -400,7 +508,8 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -408,7 +517,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() @@ -486,44 +595,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return nil } -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockBSD(uid, t, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) unlockBSD(uid fslock.UniqueID) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockBSD(uid) - return nil - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockPOSIX(uid, t, rng, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockPOSIX(uid, rng) - return nil - } - return syserror.EBADF -} - // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block @@ -543,6 +614,8 @@ func (i *inode) direntType() uint8 { return linux.DT_LNK case *socketFile: return linux.DT_SOCK + case *namedPipe: + return linux.DT_FIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: @@ -562,6 +635,9 @@ func (i *inode) isDir() bool { } func (i *inode) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } @@ -589,69 +665,63 @@ func (i *inode) touchCMtime() { i.mu.Unlock() } -// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds -// inode.mu. +// Preconditions: +// * The caller has called vfs.Mount.CheckBeginWrite(). +// * inode.mu must be locked. func (i *inode) touchCMtimeLocked() { now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } -func (i *inode) listxattr(size uint64) ([]string, error) { - return i.xattrs.Listxattr(size) +func (i *inode) listXattr(size uint64) ([]string, error) { + return i.xattrs.ListXattr(size) } -func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { - if err := i.checkPermissions(creds, vfs.MayRead); err != nil { +func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return "", syserror.ENODATA - } - return i.xattrs.Getxattr(opts) + return i.xattrs.GetXattr(opts) } -func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return syserror.EPERM - } - return i.xattrs.Setxattr(opts) + return i.xattrs.SetXattr(opts) } -func (i *inode) removexattr(creds *auth.Credentials, name string) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) removeXattr(creds *auth.Credentials, name string) error { + if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return i.xattrs.RemoveXattr(name) +} + +func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We currently only support extended attributes in the user.* and + // trusted.* namespaces. See b/148380782. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return syserror.EOPNOTSUPP } - if !i.userXattrSupported() { - return syserror.EPERM + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err } - return i.xattrs.Removexattr(name) -} - -// Extended attributes in the user.* namespace are only supported for regular -// files and directories. -func (i *inode) userXattrSupported() bool { - filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) - return filetype == linux.S_IFREG || filetype == linux.S_IFDIR + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { @@ -677,77 +747,67 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) d := fd.dentry() - if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + if err := d.inode.setStat(ctx, creds, &opts); err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.inode().listxattr(size) +// StatFS implements vfs.FileDescriptionImpl.StatFS. +func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + return globalStatfs, nil +} + +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listXattr(size) } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { return err } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { return err } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// NewMemfd creates a new tmpfs regular file and file description that can back -// an anonymous fd created by memfd_create. -func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { - fs, ok := mount.Filesystem().Impl().(*filesystem) - if !ok { - panic("NewMemfd() called with non-tmpfs mount") - } - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with - // S_IRWXUGO. - mode := linux.FileMode(0777) - inode := fs.newRegularFile(creds, mode) - rf := inode.impl.(*regularFile) - if allowSeals { - rf.seals = 0 - } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} - d := fs.newDentry(inode) - defer d.DecRef() - d.name = name +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with - // FMODE_READ | FMODE_WRITE. - var fd regularFileFD - flags := uint32(linux.O_RDWR) - if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil +// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all +// filesystem state is in-memory. +func (*fileDescription) Sync(context.Context) error { + return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index a240fb276..fc5323abc 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -34,21 +34,22 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + root.IncRef() return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() + root.DecRef(ctx) + mntns.DecRef(ctx) }, nil } diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD new file mode 100644 index 000000000..e265be0ee --- /dev/null +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -0,0 +1,51 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "verity", + srcs = [ + "filesystem.go", + "save_restore.go", + "verity.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/marshal/primitive", + "//pkg/merkletree", + "//pkg/refsvfs2", + "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) + +go_test( + name = "verity_test", + srcs = [ + "verity_test.go", + ], + library = ":verity", + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go new file mode 100644 index 000000000..2f6050cfd --- /dev/null +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -0,0 +1,1046 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "bytes" + "fmt" + "io" + "strconv" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/merkletree" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All files should be read-only. + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// verifyChild verifies the hash of child against the already verified hash of +// the parent to ensure the child is expected. verifyChild triggers a sentry +// panic if unexpected modifications to the file system are detected. In +// noCrashOnVerificationFailure mode it returns a syserror instead. +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// TODO(b/166474175): Investigate all possible errors returned in this +// function, and make sure we differentiate all errors that indicate unexpected +// modifications to the file system from the ones that are not harmful. +func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD) + if err != nil { + return nil, err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + // Read the offset of the child from the extended attributes of the + // corresponding Merkle tree file. + // This is the offset of the hash for child in its parent's Merkle tree + // file. + off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerMerkleVD, + Start: child.lowerMerkleVD, + }, &vfs.GetXattrOptions{ + Name: merkleOffsetInParentXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the file or the xattr does not + // exist, it indicates unexpected modifications to the file system. + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err)) + } + if err != nil { + return nil, err + } + // The offset xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + offset, err := strconv.Atoi(off) + if err != nil { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err)) + } + + // Open parent Merkle tree file to read and verify child's hash. + parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerMerkleVD, + Start: parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The parent Merkle tree file should have been created. If it's + // missing, it indicates an unexpected modification to the file system. + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err)) + } + if err != nil { + return nil, err + } + + // dataSize is the size of raw data for the Merkle tree. For a file, + // dataSize is the size of the whole file. For a directory, dataSize is + // the size of all its children's hashes. + dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the file or the xattr does not + // exist, it indicates unexpected modifications to the file system. + if err == syserror.ENOENT || err == syserror.ENODATA { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err)) + } + if err != nil { + return nil, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + parentSize, err := strconv.Atoi(dataSize) + if err != nil { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + } + + fdReader := vfs.FileReadWriteSeeker{ + FD: parentMerkleFD, + Ctx: ctx, + } + + parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + }, &vfs.StatOptions{}) + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err)) + } + if err != nil { + return nil, err + } + + // Since we are verifying against a directory Merkle tree, buf should + // contain the hash of the children in the parent Merkle tree when + // Verify returns with success. + var buf bytes.Buffer + if _, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: &buf, + File: &fdReader, + Tree: &fdReader, + Size: int64(parentSize), + Name: parent.name, + Mode: uint32(parentStat.Mode), + UID: parentStat.UID, + GID: parentStat.GID, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, + ReadOffset: int64(offset), + ReadSize: int64(merkletree.DigestSize(linux.FS_VERITY_HASH_ALG_SHA256)), + Expected: parent.hash, + DataAndTreeInSameFile: true, + }); err != nil && err != io.EOF { + return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err)) + } + + // Cache child hash when it's verified the first time. + if len(child.hash) == 0 { + child.hash = buf.Bytes() + } + return child, nil +} + +// verifyStat verifies the stat against the verified hash. The mode/uid/gid of +// the file is cached after verified. +func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + + // Get the path to the child dentry. This is only used to provide path + // information in failure case. + childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return err + } + + fs.verityMu.RLock() + defer fs.verityMu.RUnlock() + + fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err == syserror.ENOENT { + return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err)) + } + if err != nil { + return err + } + + merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + if err == syserror.ENODATA { + return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err)) + } + if err != nil { + return err + } + + size, err := strconv.Atoi(merkleSize) + if err != nil { + return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err)) + } + + fdReader := vfs.FileReadWriteSeeker{ + FD: fd, + Ctx: ctx, + } + + var buf bytes.Buffer + params := &merkletree.VerifyParams{ + Out: &buf, + Tree: &fdReader, + Size: int64(size), + Name: d.name, + Mode: uint32(stat.Mode), + UID: stat.UID, + GID: stat.GID, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, + ReadOffset: 0, + // Set read size to 0 so only the metadata is verified. + ReadSize: 0, + Expected: d.hash, + DataAndTreeInSameFile: false, + } + if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR { + params.DataAndTreeInSameFile = true + } + + if _, err := merkletree.Verify(params); err != nil && err != io.EOF { + return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err)) + } + d.mode = uint32(stat.Mode) + d.uid = stat.UID + d.gid = stat.GID + d.size = uint32(size) + return nil +} + +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { + if child, ok := parent.children[name]; ok { + // If verity is enabled on child, we should check again whether + // the file and the corresponding Merkle tree are as expected, + // in order to catch deletion/renaming after the last time it's + // accessed. + if child.verityEnabled() { + vfsObj := fs.vfsfs.VirtualFilesystem() + // Get the path to the child dentry. This is only used + // to provide path information in failure case. + path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD) + if err != nil { + return nil, err + } + + childVD, err := parent.getLowerAt(ctx, vfsObj, name) + if err == syserror.ENOENT { + // The file was previously accessed. If the + // file does not exist now, it indicates an + // unexpected modification to the file system. + return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path)) + } + if err != nil { + return nil, err + } + defer childVD.DecRef(ctx) + + childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + // The Merkle tree file was previous accessed. If it + // does not exist now, it indicates an unexpected + // modification to the file system. + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path)) + } + if err != nil { + return nil, err + } + + defer childMerkleVD.DecRef(ctx) + } + + // If enabling verification on files/directories is not allowed + // during runtime, all cached children are already verified. If + // runtime enable is allowed and the parent directory is + // enabled, we should verify the child hash here because it may + // be cached before enabled. + if fs.allowRuntimeEnable { + if parent.verityEnabled() { + if _, err := fs.verifyChild(ctx, parent, child); err != nil { + return nil, err + } + } + if child.verityEnabled() { + vfsObj := fs.vfsfs.VirtualFilesystem() + mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.lowerVD, + Start: child.lowerVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + return nil, err + } + if err := fs.verifyStat(ctx, child, stat); err != nil { + return nil, err + } + } + } + return child, nil + } + child, err := fs.lookupAndVerifyLocked(ctx, parent, name) + if err != nil { + return nil, err + } + if parent.children == nil { + parent.children = make(map[string]*dentry) + } + parent.children[name] = child + // child's refcount is initially 0, so it may be dropped after traversal. + *ds = appendDentry(*ds, child) + return child, nil +} + +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + vfsObj := fs.vfsfs.VirtualFilesystem() + + childVD, childErr := parent.getLowerAt(ctx, vfsObj, name) + // We will handle ENOENT separately, as it may indicate unexpected + // modifications to the file system, and may cause a sentry panic. + if childErr != nil && childErr != syserror.ENOENT { + return nil, childErr + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + if childErr == nil { + defer childVD.DecRef(ctx) + } + + childMerkleVD, childMerkleErr := parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + // We will handle ENOENT separately, as it may indicate unexpected + // modifications to the file system, and may cause a sentry panic. + if childMerkleErr != nil && childMerkleErr != syserror.ENOENT { + return nil, childMerkleErr + } + + // The dentry needs to be cleaned up if any error occurs. IncRef will be + // called if a verity child dentry is successfully created. + if childMerkleErr == nil { + defer childMerkleVD.DecRef(ctx) + } + + // Get the path to the parent dentry. This is only used to provide path + // information in failure case. + parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD) + if err != nil { + return nil, err + } + + // TODO(b/166474175): Investigate all possible errors of childErr and + // childMerkleErr, and make sure we differentiate all errors that + // indicate unexpected modifications to the file system from the ones + // that are not harmful. + if childErr == syserror.ENOENT && childMerkleErr == nil { + // Failed to get child file/directory dentry. However the + // corresponding Merkle tree is found. This indicates an + // unexpected modification to the file system that + // removed/renamed the child. + return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name)) + } else if childErr == nil && childMerkleErr == syserror.ENOENT { + // If in allowRuntimeEnable mode, and the Merkle tree file is + // not created yet, we create an empty Merkle tree file, so that + // if the file is enabled through ioctl, we have the Merkle tree + // file open and ready to use. + // This may cause empty and unused Merkle tree files in + // allowRuntimeEnable mode, if they are never enabled. This + // does not affect verification, as we rely on cached hash to + // decide whether to perform verification, not the existence of + // the Merkle tree file. Also, those Merkle tree files are + // always hidden and cannot be accessed by verity fs users. + if fs.allowRuntimeEnable { + childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parent.lowerVD, + Start: parent.lowerVD, + Path: fspath.Parse(merklePrefix + name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, + }) + if err != nil { + return nil, err + } + childMerkleFD.DecRef(ctx) + childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name) + if err != nil { + return nil, err + } + } else { + // If runtime enable is not allowed. This indicates an + // unexpected modification to the file system that + // removed/renamed the Merkle tree file. + return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name)) + } + } else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT { + // Both the child and the corresponding Merkle tree are missing. + // This could be an unexpected modification or due to incorrect + // parameter. + // TODO(b/167752508): Investigate possible ways to differentiate + // cases that both files are deleted from cases that they never + // exist in the file system. + return nil, alertIntegrityViolation(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name)) + } + + mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + return nil, err + } + + child := fs.newDentry() + child.lowerVD = childVD + child.lowerMerkleVD = childMerkleVD + + // Increase the reference for both childVD and childMerkleVD as they are + // held by child. If this function fails and the child is destroyed, the + // references will be decreased in destroyLocked. + childVD.IncRef() + childMerkleVD.IncRef() + + parent.IncRef() + child.parent = parent + child.name = name + + child.mode = uint32(stat.Mode) + child.uid = stat.UID + child.gid = stat.GID + + // Verify child hash. This should always be performed unless in + // allowRuntimeEnable mode and the parent directory hasn't been enabled + // yet. + if parent.verityEnabled() { + if _, err := fs.verifyChild(ctx, parent, child); err != nil { + child.destroyLocked(ctx) + return nil, err + } + } + if child.verityEnabled() { + if err := fs.verifyStat(ctx, child, stat); err != nil { + child.destroyLocked(ctx) + return nil, err + } + } + + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + // Verity file system is read-only. + if ats&vfs.MayWrite != 0 { + return syserror.EROFS + } + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Verity fs is read-only. + if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 { + return nil, syserror.EROFS + } + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + + // Open existing child or follow symlink. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds) + parent.dirMu.Unlock() + if err != nil { + return nil, err + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + // Users should not open the Merkle tree files. Those are for verity fs + // use only. + if strings.Contains(d.name, merklePrefix) { + return nil, syserror.EPERM + } + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + + // Verity fs is read-only. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EROFS + } + + // Get the path to the target file. This is only used to provide path + // information in failure case. + path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD) + if err != nil { + return nil, err + } + + // Open the file in the underlying file system. + lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, opts) + + // The file should exist, as we succeeded in finding its dentry. If it's + // missing, it indicates an unexpected modification to the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path)) + } + return nil, err + } + + // lowerFD needs to be cleaned up if any error occurs. IncRef will be + // called if a verity FD is successfully created. + defer lowerFD.DecRef(ctx) + + // Open the Merkle tree file corresponding to the current file/directory + // to be used later for verifying Read/Walk. + merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + + // The Merkle tree file should exist, as we succeeded in finding its + // dentry. If it's missing, it indicates an unexpected modification to + // the file system. + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + + // merkleReader needs to be cleaned up if any error occurs. IncRef will + // be called if a verity FD is successfully created. + defer merkleReader.DecRef(ctx) + + lowerFlags := lowerFD.StatusFlags() + lowerFDOpts := lowerFD.Options() + var merkleWriter *vfs.FileDescription + var parentMerkleWriter *vfs.FileDescription + + // Only open the Merkle tree files for write if in allowRuntimeEnable + // mode. + if d.fs.allowRuntimeEnable { + merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerMerkleVD, + Start: d.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path)) + } + return nil, err + } + // merkleWriter is cleaned up if any error occurs. IncRef will + // be called if a verity FD is created successfully. + defer merkleWriter.DecRef(ctx) + + if d.parent != nil { + parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.parent.lowerMerkleVD, + Start: d.parent.lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_APPEND, + }) + if err != nil { + if err == syserror.ENOENT { + parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD) + return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath)) + } + return nil, err + } + // parentMerkleWriter is cleaned up if any error occurs. IncRef + // will be called if a verity FD is created successfully. + defer parentMerkleWriter.DecRef(ctx) + } + } + + fd := &fileDescription{ + d: d, + lowerFD: lowerFD, + merkleReader: merkleReader, + merkleWriter: merkleWriter, + parentMerkleWriter: parentMerkleWriter, + isDir: d.isDir(), + } + + if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil { + return nil, err + } + lowerFD.IncRef() + merkleReader.IncRef() + if merkleWriter != nil { + merkleWriter.IncRef() + } + if parentMerkleWriter != nil { + parentMerkleWriter.IncRef() + } + return &fd.vfsfd, err +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + //TODO(b/162787271): Provide integrity check for ReadlinkAt. + return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should +// be verified. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + + var stat linux.Statx + stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, &opts) + if err != nil { + return linux.Statx{}, err + } + if d.verityEnabled() { + if err := fs.verifyStat(ctx, d, stat); err != nil { + return linux.Statx{}, err + } + } + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + // TODO(b/159261227): Implement StatFSAt. + return linux.Statfs{}, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, size) +} + +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &opts) +} + +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go new file mode 100644 index 000000000..4a161163c --- /dev/null +++ b/pkg/sentry/fsimpl/verity/save_restore.go @@ -0,0 +1,27 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/refsvfs2" +) + +func (d *dentry) afterLoad() { + if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 { + refsvfs2.Register(d, "verity.dentry") + } +} diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go new file mode 100644 index 000000000..92ca6ca6b --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -0,0 +1,880 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package verity provides a filesystem implementation that is a wrapper of +// another file system. +// The verity file system provides integrity check for the underlying file +// system by providing verification for path traversals and each read. +// The verity file system is read-only, except for one case: when +// allowRuntimeEnable is true, additional Merkle files can be generated using +// the FS_IOC_ENABLE_VERITY ioctl. +package verity + +import ( + "fmt" + "math" + "strconv" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/merkletree" + "gvisor.dev/gvisor/pkg/refsvfs2" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Name is the default filesystem name. +const Name = "verity" + +// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle +// tree file for "/foo" is "/.merkle.verity.foo". +const merklePrefix = ".merkle.verity." + +// merkleoffsetInParentXattr is the extended attribute name specifying the +// offset of child hash in its parent's Merkle tree. +const merkleOffsetInParentXattr = "user.merkle.offset" + +// merkleSizeXattr is the extended attribute name specifying the size of data +// hashed by the corresponding Merkle tree. For a file, it's the size of the +// whole file. For a directory, it's the size of all its children's hashes. +const merkleSizeXattr = "user.merkle.size" + +// sizeOfStringInt32 is the size for a 32 bit integer stored as string in +// extended attributes. The maximum value of a 32 bit integer is 10 digits. +const sizeOfStringInt32 = 10 + +// noCrashOnVerificationFailure indicates whether the sandbox should panic +// whenever verification fails. If true, an error is returned instead of +// panicking. This should only be set for tests. +// TOOD(b/165661693): Decide whether to panic or return error based on this +// flag. +var noCrashOnVerificationFailure bool + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +// +// +stateify savable +type filesystem struct { + vfsfs vfs.Filesystem + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the underlying file system. creds is immutable. + creds *auth.Credentials + + // allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY + // to build Merkle trees in the verity file system is allowed. If this + // is false, no new Merkle trees can be built, and only the files that + // had Merkle trees before startup (e.g. from a host filesystem mounted + // with gofer fs) can be verified. + allowRuntimeEnable bool + + // lowerMount is the underlying file system mount. + lowerMount *vfs.Mount + + // rootDentry is the mount root Dentry for this file system, which + // stores the root hash of the whole file system in bytes. + rootDentry *dentry + + // renameMu synchronizes renaming with non-renaming operations in order + // to ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex `state:"nosave"` + + // verityMu synchronizes enabling verity files, protects files or + // directories from being enabled by different threads simultaneously. + // It also ensures that verity does not access files that are being + // enabled. + // + // Also, the directory Merkle trees depends on the generated trees of + // its children. So they shouldn't be enabled the same time. This lock + // is for the whole file system to ensure that no more than one file is + // enabled the same time. + verityMu sync.RWMutex +} + +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +// +// +stateify savable +type InternalFilesystemOptions struct { + // RootMerkleFileName is the name of the verity root Merkle tree file. + RootMerkleFileName string + + // LowerName is the name of the filesystem wrapped by verity fs. + LowerName string + + // RootHash is the root hash of the overall verity file system. + RootHash []byte + + // AllowRuntimeEnable specifies whether the verity file system allows + // enabling verification for files (i.e. building Merkle trees) during + // runtime. + AllowRuntimeEnable bool + + // LowerGetFSOptions is the file system option for the lower layer file + // system wrapped by verity file system. + LowerGetFSOptions vfs.GetFilesystemOptions + + // NoCrashOnVerificationFailure indicates whether the sandbox should + // panic whenever verification fails. If true, an error is returned + // instead of panicking. This should only be set for tests. + NoCrashOnVerificationFailure bool +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// alertIntegrityViolation alerts a violation of integrity, which usually means +// unexpected modification to the file system is detected. In +// noCrashOnVerificationFailure mode, it returns EIO, otherwise it panic. +func alertIntegrityViolation(msg string) error { + if noCrashOnVerificationFailure { + return syserror.EIO + } + panic(msg) +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if !ok { + ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs") + return nil, nil, syserror.EINVAL + } + noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure + + // Mount the lower file system. The lower file system is wrapped inside + // verity, and should not be exposed or connected. + mopts := &vfs.MountOptions{ + GetFilesystemOptions: iopts.LowerGetFSOptions, + InternalMount: true, + } + mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts) + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + creds: creds.Fork(), + lowerMount: mnt, + allowRuntimeEnable: iopts.AllowRuntimeEnable, + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. + d := fs.newDentry() + d.refs = 1 + lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root()) + lowerVD.IncRef() + d.lowerVD = lowerVD + + rootMerkleName := merklePrefix + iopts.RootMerkleFileName + + lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.GetDentryOptions{}) + + // If runtime enable is allowed, the root merkle tree may be absent. We + // should create the tree file. + if err == syserror.ENOENT && fs.allowRuntimeEnable { + lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT, + Mode: 0644, + }) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + lowerMerkleFD.DecRef(ctx) + lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + Path: fspath.Parse(rootMerkleName), + }, &vfs.GetDentryOptions{}) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + } else if err != nil { + // Failed to get dentry for the root Merkle file. This + // indicates an unexpected modification that removed/renamed + // the root Merkle file, or it's never generated. + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, alertIntegrityViolation("Failed to find root Merkle file") + } + d.lowerMerkleVD = lowerMerkleVD + + // Get metadata from the underlying file system. + const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.StatOptions{ + Mask: statMask, + }) + if err != nil { + fs.vfsfs.DecRef(ctx) + d.DecRef(ctx) + return nil, nil, err + } + + d.mode = uint32(stat.Mode) + d.uid = stat.UID + d.gid = stat.GID + d.hash = make([]byte, len(iopts.RootHash)) + + if !fs.allowRuntimeEnable { + if err := fs.verifyStat(ctx, d, stat); err != nil { + return nil, nil, err + } + } + + copy(d.hash, iopts.RootHash) + d.vfsd.Init(d) + + fs.rootDentry = d + + return &fs.vfsfs, &d.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + fs.lowerMount.DecRef(ctx) +} + +// dentry implements vfs.DentryImpl. +// +// +stateify savable +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid, gid and size are the file mode, owner, group, and size of + // the file in the underlying file system. + mode uint32 + uid uint32 + gid uint32 + size uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex `state:"nosave"` + children map[string]*dentry + + // lowerVD is the VirtualDentry in the underlying file system. + lowerVD vfs.VirtualDentry + + // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree + // in the underlying file system. + lowerMerkleVD vfs.VirtualDentry + + // hash is the calculated hash for the current file or directory. + hash []byte +} + +// newDentry creates a new dentry representing the given verity file. The +// dentry initially has no references; it is the caller's responsibility to set +// the dentry's reference count and/or call dentry.destroy() as appropriate. +// The dentry is initially invalid in that it contains no underlying dentry; +// the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.vfsd.Init(d) + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Register(d, "verity.dentry") + } + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef(ctx context.Context) { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("verity.dentry.DecRef() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +func (d *dentry) checkDropLocked(ctx context.Context) { + // Dentries with a positive reference count must be retained. Dentries + // with a negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + // Refs is still zero; destroy it. + d.destroyLocked(ctx) + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +func (d *dentry) destroyLocked(ctx context.Context) { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("verity.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("verity.dentry.destroyLocked() called with references on the dentry") + } + + if d.lowerVD.Ok() { + d.lowerVD.DecRef(ctx) + } + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Unregister(d, "verity.dentry") + } + + if d.lowerMerkleVD.Ok() { + d.lowerMerkleVD.DecRef(ctx) + } + + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkDropLocked(ctx) + } else if refs < 0 { + panic("verity.dentry.DecRef() called without holding a reference") + } + } +} + +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (d *dentry) LeakMessage() string { + return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { + //TODO(b/159261227): Implement InotifyWithParent. +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + //TODO(b/159261227): Implement Watches. + return nil +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +func (d *dentry) OnZeroWatches(context.Context) { + //TODO(b/159261227): Implement OnZeroWatches. +} + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// verityEnabled checks whether the file is enabled with verity features. It +// should always be true if runtime enable is not allowed. In runtime enable +// mode, it returns true if the target has been enabled with +// ioctl(FS_IOC_ENABLE_VERITY). +func (d *dentry) verityEnabled() bool { + return !d.fs.allowRuntimeEnable || len(d.hash) != 0 +} + +// getLowerAt returns the dentry in the underlying file system, which is +// represented by filename relative to d. +func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) { + return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + Path: fspath.Parse(filename), + }, &vfs.GetDentryOptions{}) +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) +} + +// FileDescription implements vfs.FileDescriptionImpl for verity fds. +// FileDescription is a wrapper of the underlying lowerFD, with support to build +// Merkle trees through the Linux fs-verity API to verify contents read from +// lowerFD. +// +// +stateify savable +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + // d is the corresponding dentry to the fileDescription. + d *dentry + + // isDir specifies whehter the fileDescription points to a directory. + isDir bool + + // lowerFD is the FileDescription corresponding to the file in the + // underlying file system. + lowerFD *vfs.FileDescription + + // merkleReader is the read-only FileDescription corresponding to the + // Merkle tree file in the underlying file system. + merkleReader *vfs.FileDescription + + // merkleWriter is the FileDescription corresponding to the Merkle tree + // file in the underlying file system for writing. This should only be + // used when allowRuntimeEnable is set to true. + merkleWriter *vfs.FileDescription + + // parentMerkleWriter is the FileDescription of the Merkle tree for the + // directory that contains the current file/directory. This is only used + // if allowRuntimeEnable is set to true. + parentMerkleWriter *vfs.FileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex `state:"nosave"` + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + fd.lowerFD.DecRef(ctx) + fd.merkleReader.DecRef(ctx) + if fd.merkleWriter != nil { + fd.merkleWriter.DecRef(ctx) + } + if fd.parentMerkleWriter != nil { + fd.parentMerkleWriter.DecRef(ctx) + } +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + // TODO(b/162788573): Add integrity check for metadata. + stat, err := fd.lowerFD.Stat(ctx, opts) + if err != nil { + return linux.Statx{}, err + } + if fd.d.verityEnabled() { + if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil { + return linux.Statx{}, err + } + } + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + // Verity files are read-only. + return syserror.EPERM +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + n := int64(0) + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + n = fd.off + case linux.SEEK_END: + n = int64(fd.d.size) + default: + return 0, syserror.EINVAL + } + if offset > math.MaxInt64-n { + return 0, syserror.EINVAL + } + offset += n + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// generateMerkle generates a Merkle tree file for fd. If fd points to a file +// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash +// of the generated Merkle tree and the data size is returned. If fd points to +// a regular file, the data is the content of the file. If fd points to a +// directory, the data is all hahes of its children, written to the Merkle tree +// file. +func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) { + fdReader := vfs.FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + merkleReader := vfs.FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + merkleWriter := vfs.FileReadWriteSeeker{ + FD: fd.merkleWriter, + Ctx: ctx, + } + params := &merkletree.GenerateParams{ + TreeReader: &merkleReader, + TreeWriter: &merkleWriter, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, + } + + switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT { + case linux.S_IFREG: + // For a regular file, generate a Merkle tree based on its + // content. + var err error + stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + + params.File = &fdReader + params.Size = int64(stat.Size) + params.Name = fd.d.name + params.Mode = uint32(stat.Mode) + params.UID = stat.UID + params.GID = stat.GID + params.DataAndTreeInSameFile = false + case linux.S_IFDIR: + // For a directory, generate a Merkle tree based on the hashes + // of its children that has already been written to the Merkle + // tree file. + merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + + params.Size = int64(merkleStat.Size) + + stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return nil, 0, err + } + + params.File = &merkleReader + params.Name = fd.d.name + params.Mode = uint32(stat.Mode) + params.UID = stat.UID + params.GID = stat.GID + params.DataAndTreeInSameFile = true + default: + // TODO(b/167728857): Investigate whether and how we should + // enable other types of file. + return nil, 0, syserror.EINVAL + } + hash, err := merkletree.Generate(params) + return hash, uint64(params.Size), err +} + +// enableVerity enables verity features on fd by generating a Merkle tree file +// and stores its hash in its parent directory's Merkle tree. +func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) { + if !fd.d.fs.allowRuntimeEnable { + return 0, syserror.EPERM + } + + fd.d.fs.verityMu.Lock() + defer fd.d.fs.verityMu.Unlock() + + // In allowRuntimeEnable mode, the underlying fd and read/write fd for + // the Merkle tree file should have all been initialized. For any file + // or directory other than the root, the parent Merkle tree file should + // have also been initialized. + if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) { + return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") + } + + hash, dataSize, err := fd.generateMerkle(ctx) + if err != nil { + return 0, err + } + + if fd.parentMerkleWriter != nil { + stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + + // Write the hash of fd to the parent directory's Merkle tree + // file, as it should be part of the parent Merkle tree data. + // parentMerkleWriter is open with O_APPEND, so it should write + // directly to the end of the file. + if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil { + return 0, err + } + + // Record the offset of the hash of fd in parent directory's + // Merkle tree file. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleOffsetInParentXattr, + Value: strconv.Itoa(int(stat.Size)), + }); err != nil { + return 0, err + } + } + + // Record the size of the data being hashed for fd. + if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{ + Name: merkleSizeXattr, + Value: strconv.Itoa(int(dataSize)), + }); err != nil { + return 0, err + } + fd.d.hash = append(fd.d.hash, hash...) + return 0, nil +} + +// measureVerity returns the hash of fd, saved in verityDigest. +func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + return 0, syserror.EINVAL + } + var metadata linux.DigestMetadata + + // If allowRuntimeEnable is true, an empty fd.d.hash indicates that + // verity is not enabled for the file. If allowRuntimeEnable is false, + // this is an integrity violation because all files should have verity + // enabled, in which case fd.d.hash should be set. + if len(fd.d.hash) == 0 { + if fd.d.fs.allowRuntimeEnable { + return 0, syserror.ENODATA + } + return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found") + } + + // The first part of VerityDigest is the metadata. + if _, err := metadata.CopyIn(t, verityDigest); err != nil { + return 0, err + } + if metadata.DigestSize < uint16(len(fd.d.hash)) { + return 0, syserror.EOVERFLOW + } + + // Populate the output digest size, since DigestSize is both input and + // output. + metadata.DigestSize = uint16(len(fd.d.hash)) + + // First copy the metadata. + if _, err := metadata.CopyOut(t, verityDigest); err != nil { + return 0, err + } + + // Now copy the root hash bytes to the memory after metadata. + _, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash) + return 0, err +} + +func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) { + f := int32(0) + + // All enabled files should store a hash. This flag is not settable via + // FS_IOC_SETFLAGS. + if len(fd.d.hash) != 0 { + f |= linux.FS_VERITY_FL + } + + t := kernel.TaskFromContext(ctx) + if t == nil { + return 0, syserror.EINVAL + } + _, err := primitive.CopyInt32Out(t, flags, f) + return 0, err +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FS_IOC_ENABLE_VERITY: + return fd.enableVerity(ctx, uio) + case linux.FS_IOC_MEASURE_VERITY: + return fd.measureVerity(ctx, uio, args[2].Pointer()) + case linux.FS_IOC_GETFLAGS: + return fd.verityFlags(ctx, uio, args[2].Pointer()) + default: + // TODO(b/169682228): Investigate which ioctl commands should + // be allowed. + return 0, syserror.ENOSYS + } +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Implement Read with PRead by setting offset. + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // No need to verify if the file is not enabled yet in + // allowRuntimeEnable mode. + if !fd.d.verityEnabled() { + return fd.lowerFD.PRead(ctx, dst, offset, opts) + } + + fd.d.fs.verityMu.RLock() + defer fd.d.fs.verityMu.RUnlock() + // dataSize is the size of the whole file. + dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{ + Name: merkleSizeXattr, + Size: sizeOfStringInt32, + }) + + // The Merkle tree file for the child should have been created and + // contains the expected xattrs. If the xattr does not exist, it + // indicates unexpected modifications to the file system. + if err == syserror.ENODATA { + return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err)) + } + if err != nil { + return 0, err + } + + // The dataSize xattr should be an integer. If it's not, it indicates + // unexpected modifications to the file system. + size, err := strconv.Atoi(dataSize) + if err != nil { + return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err)) + } + + dataReader := vfs.FileReadWriteSeeker{ + FD: fd.lowerFD, + Ctx: ctx, + } + + merkleReader := vfs.FileReadWriteSeeker{ + FD: fd.merkleReader, + Ctx: ctx, + } + + n, err := merkletree.Verify(&merkletree.VerifyParams{ + Out: dst.Writer(ctx), + File: &dataReader, + Tree: &merkleReader, + Size: int64(size), + Name: fd.d.name, + Mode: fd.d.mode, + UID: fd.d.uid, + GID: fd.d.gid, + //TODO(b/156980949): Support passing other hash algorithms. + HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256, + ReadOffset: offset, + ReadSize: dst.NumBytes(), + Expected: fd.d.hash, + DataAndTreeInSameFile: false, + }) + if err != nil { + return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err)) + } + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EROFS +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.EROFS +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go new file mode 100644 index 000000000..c647cbfd3 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity_test.go @@ -0,0 +1,700 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "fmt" + "io" + "math/rand" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// rootMerkleFilename is the name of the root Merkle tree file. +const rootMerkleFilename = "root.verity" + +// maxDataSize is the maximum data size written to the file for test. +const maxDataSize = 100000 + +// newVerityRoot creates a new verity mount, and returns the root. The +// underlying file system is tmpfs. If the error is not nil, then cleanup +// should be called when the root is no longer needed. +func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("testutil.Boot: %v", err) + } + + ctx := k.SupervisorContext() + + rand.Seed(time.Now().UnixNano()) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(ctx); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: InternalFilesystemOptions{ + RootMerkleFileName: rootMerkleFilename, + LowerName: "tmpfs", + AllowRuntimeEnable: true, + NoCrashOnVerificationFailure: true, + }, + }, + }) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("NewMountNamespace: %v", err) + } + root := mntns.Root() + root.IncRef() + + // Use lowerRoot in the task as we modify the lower file system + // directly in many tests. + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := testutil.CreateTask(ctx, "name", tc, mntns, lowerRoot, lowerRoot) + if err != nil { + t.Fatalf("testutil.CreateTask: %v", err) + } + + t.Helper() + t.Cleanup(func() { + root.DecRef(ctx) + mntns.DecRef(ctx) + }) + return vfsObj, root, task, nil +} + +// newFileFD creates a new file in the verity mount, and returns the FD. The FD +// points to a file that has random data generated. +func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) { + creds := auth.CredentialsFromContext(ctx) + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + + // Create the file in the underlying file system. + lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + return nil, 0, err + } + + // Generate random data to be written to the file. + dataSize := rand.Intn(maxDataSize) + 1 + data := make([]byte, dataSize) + rand.Read(data) + + // Write directly to the underlying FD, since verity FD is read-only. + n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + return nil, 0, err + } + + if n != int64(len(data)) { + return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data)) + } + + lowerFD.DecRef(ctx) + + // Now open the verity file descriptor. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular | mode, + }) + return fd, dataSize, err +} + +// corruptRandomBit randomly flips a bit in the file represented by fd. +func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error { + // Flip a random bit in the underlying file. + randomPos := int64(rand.Intn(size)) + byteToModify := make([]byte, 1) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil { + return fmt.Errorf("lowerFD.PRead: %v", err) + } + byteToModify[0] ^= 1 + if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil { + return fmt.Errorf("lowerFD.PWrite: %v", err) + } + return nil +} + +// TestOpen ensures that when a file is created, the corresponding Merkle tree +// file and the root Merkle tree file exist. +func TestOpen(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Ensure that the corresponding Merkle tree file is created. + lowerRoot := root.Dentry().Impl().(*dentry).lowerVD + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err) + } + + // Ensure the root merkle tree file is created. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerRoot, + Start: lowerRoot, + Path: fspath.Parse(merklePrefix + rootMerkleFilename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }); err != nil { + t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err) + } +} + +// TestPReadUnmodifiedFileSucceeds ensures that pread from an untouched verity +// file succeeds after enabling verity for it. +func TestPReadUnmodifiedFileSucceeds(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirm a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.PRead: %v", err) + } + + if n != int64(size) { + t.Errorf("fd.PRead got read length %d, want %d", n, size) + } +} + +// TestReadUnmodifiedFileSucceeds ensures that read from an untouched verity +// file succeeds after enabling verity for it. +func TestReadUnmodifiedFileSucceeds(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirm a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + buf := make([]byte, size) + n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.Read: %v", err) + } + + if n != int64(size) { + t.Errorf("fd.PRead got read length %d, want %d", n, size) + } +} + +// TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file +// succeeds after enabling verity for it. +func TestReopenUnmodifiedFileSucceeds(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirms a normal read succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Ensure reopening the verity enabled file succeeds. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != nil { + t.Errorf("reopen enabled file failed: %v", err) + } +} + +// TestPReadModifiedFileFails ensures that read from a modified verity file +// fails. +func TestPReadModifiedFileFails(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerFD that's read/writable. + lowerVD := fd.Impl().(*fileDescription).d.lowerVD + + lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if err := corruptRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + // Confirm that read from the modified file fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + t.Fatalf("fd.PRead succeeded, expected failure") + } +} + +// TestReadModifiedFileFails ensures that read from a modified verity file +// fails. +func TestReadModifiedFileFails(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerFD that's read/writable. + lowerVD := fd.Impl().(*fileDescription).d.lowerVD + + lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if err := corruptRandomBit(ctx, lowerFD, size); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + // Confirm that read from the modified file fails. + buf := make([]byte, size) + if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil { + t.Fatalf("fd.Read succeeded, expected failure") + } +} + +// TestModifiedMerkleFails ensures that read from a verity file fails if the +// corresponding Merkle tree file is modified. +func TestModifiedMerkleFails(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerMerkleFD that's read/writable. + lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD + + lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: lowerMerkleVD, + Start: lowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + // Flip a random bit in the Merkle tree file. + stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat: %v", err) + } + merkleSize := int(stat.Size) + if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + // Confirm that read from a file with modified Merkle tree fails. + buf := make([]byte, size) + if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil { + fmt.Println(buf) + t.Fatalf("fd.PRead succeeded with modified Merkle file") + } +} + +// TestModifiedParentMerkleFails ensures that open a verity enabled file in a +// verity enabled directory fails if the hashes related to the target file in +// the parent Merkle tree file is modified. +func TestModifiedParentMerkleFails(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Enable verity on the parent directory. + parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + // Open a new lowerMerkleFD that's read/writable. + parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD + + parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: parentLowerMerkleVD, + Start: parentLowerMerkleVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + t.Fatalf("OpenAt: %v", err) + } + + // Flip a random bit in the parent Merkle tree file. + // This parent directory contains only one child, so any random + // modification in the parent Merkle tree should cause verification + // failure when opening the child file. + stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat: %v", err) + } + parentMerkleSize := int(stat.Size) + if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil { + t.Fatalf("corruptRandomBit: %v", err) + } + + parentLowerMerkleFD.DecRef(ctx) + + // Ensure reopening the verity enabled file fails. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err == nil { + t.Errorf("OpenAt file with modified parent Merkle succeeded") + } +} + +// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file +// succeeds after enabling verity for it. +func TestUnmodifiedStatSucceeds(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file and confirms stat succeeds. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("fd.Ioctl: %v", err) + } + + if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil { + t.Errorf("fd.Stat: %v", err) + } +} + +// TestModifiedStatFails checks that getting stat for a file with modified stat +// should fail. +func TestModifiedStatFails(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("fd.Ioctl: %v", err) + } + + lowerFD := fd.Impl().(*fileDescription).lowerFD + // Change the stat of the underlying file, and check that stat fails. + if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: uint32(linux.STATX_MODE), + Mode: 0777, + }, + }); err != nil { + t.Fatalf("lowerFD.SetStat: %v", err) + } + + if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil { + t.Errorf("fd.Stat succeeded when it should fail") + } +} + +// TestOpenDeletedOrRenamedFileFails ensures that opening a deleted/renamed +// verity enabled file or the corresponding Merkle tree file fails with the +// verify error. +func TestOpenDeletedFileFails(t *testing.T) { + testCases := []struct { + // Tests removing files is remove is true. Otherwise tests + // renaming files. + remove bool + // The original file is removed/renamed if changeFile is true. + changeFile bool + // The Merkle tree file is removed/renamed if changeMerkleFile + // is true. + changeMerkleFile bool + }{ + { + remove: true, + changeFile: true, + changeMerkleFile: false, + }, + { + remove: true, + changeFile: false, + changeMerkleFile: true, + }, + { + remove: false, + changeFile: true, + changeMerkleFile: false, + }, + { + remove: false, + changeFile: true, + changeMerkleFile: false, + }, + } + for _, tc := range testCases { + t.Run(fmt.Sprintf("remove:%t", tc.remove), func(t *testing.T) { + vfsObj, root, ctx, err := newVerityRoot(t) + if err != nil { + t.Fatalf("newVerityRoot: %v", err) + } + + filename := "verity-test-file" + fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644) + if err != nil { + t.Fatalf("newFileFD: %v", err) + } + + // Enable verity on the file. + var args arch.SyscallArguments + args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY} + if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil { + t.Fatalf("Ioctl: %v", err) + } + + rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD + if tc.remove { + if tc.changeFile { + if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(filename), + }); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + if tc.changeMerkleFile { + if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + filename), + }); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + } else { + newFilename := "renamed-test-file" + if tc.changeFile { + if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(filename), + }, &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(newFilename), + }, &vfs.RenameOptions{}); err != nil { + t.Fatalf("RenameAt: %v", err) + } + } + if tc.changeMerkleFile { + if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + filename), + }, &vfs.PathOperation{ + Root: rootLowerVD, + Start: rootLowerVD, + Path: fspath.Parse(merklePrefix + newFilename), + }, &vfs.RenameOptions{}); err != nil { + t.Fatalf("UnlinkAt: %v", err) + } + } + } + + // Ensure reopening the verity enabled file fails. + if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + Mode: linux.ModeRegular, + }); err != syserror.EIO { + t.Errorf("got OpenAt error: %v, expected EIO", err) + } + }) + } +} diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index 61c78569d..300b7ccce 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -7,11 +7,14 @@ go_library( srcs = [ "cgroup.go", "hostmm.go", + "membarrier.go", ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/fd", "//pkg/log", "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/hostmm/membarrier.go b/pkg/sentry/hostmm/membarrier.go new file mode 100644 index 000000000..4468d75f1 --- /dev/null +++ b/pkg/sentry/hostmm/membarrier.go @@ -0,0 +1,90 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostmm + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" +) + +var ( + haveMembarrierGlobal = false + haveMembarrierPrivateExpedited = false +) + +func init() { + supported, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */) + if e != 0 { + if e != syscall.ENOSYS { + log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error()) + } + return + } + // We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to + // all CPUs running tasks that have previously invoked + // MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DOS risk. + // (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU + // grace period to elapse without bothering other CPUs. + // MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks + // sharing the caller's MM.) + if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 { + haveMembarrierGlobal = true + } + if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req { + if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 { + log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error()) + } else { + haveMembarrierPrivateExpedited = true + } + } +} + +// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported. +func HaveGlobalMemoryBarrier() bool { + return haveMembarrierGlobal +} + +// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have +// passed through a state where all memory accesses to user-space addresses +// match program order between entry to and return from [GlobalMemoryBarrier]", +// as for membarrier(2). +// +// Preconditions: HaveGlobalMemoryBarrier() == true. +func GlobalMemoryBarrier() error { + if _, _, e := syscall.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 { + return e + } + return nil +} + +// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported. +func HaveProcessMemoryBarrier() bool { + return haveMembarrierPrivateExpedited +} + +// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only +// synchronizes with threads sharing a virtual address space (from the host OS' +// perspective) with the calling thread. +// +// Preconditions: HaveProcessMemoryBarrier() == true. +func ProcessMemoryBarrier() error { + if _, _, e := syscall.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 { + return e + } + return nil +} diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 07bf39fed..5bba9de0b 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -15,6 +15,7 @@ go_library( ], deps = [ "//pkg/context", + "//pkg/tcpip", "//pkg/tcpip/stack", ], ) diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 2916a0644..f31277d30 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -15,7 +15,10 @@ // Package inet defines semantics for IP stacks. package inet -import "gvisor.dev/gvisor/pkg/tcpip/stack" +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) // Stack represents a TCP/IP stack. type Stack interface { @@ -29,9 +32,13 @@ type Stack interface { InterfaceAddrs() map[int32][]InterfaceAddr // AddInterfaceAddr adds an address to the network interface identified by - // index. + // idx. AddInterfaceAddr(idx int32, addr InterfaceAddr) error + // RemoveInterfaceAddr removes an address from the network interface + // identified by idx. + RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error + // SupportsIPv6 returns true if the stack supports IPv6 connectivity. SupportsIPv6() bool @@ -56,6 +63,12 @@ type Stack interface { // settings. SetTCPSACKEnabled(enabled bool) error + // TCPRecovery returns the TCP loss detection algorithm. + TCPRecovery() (TCPLossRecovery, error) + + // SetTCPRecovery attempts to change TCP loss detection algorithm. + SetTCPRecovery(recovery TCPLossRecovery) error + // Statistics reports stack statistics. Statistics(stat interface{}, arg string) error @@ -74,6 +87,12 @@ type Stack interface { // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful // for restoring a stack after a save. RestoreCleanupEndpoints([]stack.TransportEndpoint) + + // Forwarding returns if packet forwarding between NICs is enabled. + Forwarding(protocol tcpip.NetworkProtocolNumber) bool + + // SetForwarding enables or disables packet forwarding between NICs. + SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error } // Interface contains information about a network interface. @@ -189,3 +208,14 @@ type StatSNMPUDP [8]uint64 // StatSNMPUDPLite describes UdpLite line of /proc/net/snmp. type StatSNMPUDPLite [8]uint64 + +// TCPLossRecovery indicates TCP loss detection and recovery methods to use. +type TCPLossRecovery int32 + +// Loss recovery constants from include/net/tcp.h which are used to set +// /proc/sys/net/ipv4/tcp_recovery. +const ( + TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota + TCP_RACK_STATIC_REO_WND + TCP_RACK_NO_DUPTHRESH +) diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index d8961fc94..9ebeba8a3 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -14,7 +14,13 @@ package inet -import "gvisor.dev/gvisor/pkg/tcpip/stack" +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) // TestStack is a dummy implementation of Stack for tests. type TestStack struct { @@ -25,6 +31,8 @@ type TestStack struct { TCPRecvBufSize TCPBufferSize TCPSendBufSize TCPBufferSize TCPSACKFlag bool + Recovery TCPLossRecovery + IPForwarding bool } // NewTestStack returns a TestStack with no network interfaces. The value of @@ -53,6 +61,24 @@ func (s *TestStack) AddInterfaceAddr(idx int32, addr InterfaceAddr) error { return nil } +// RemoveInterfaceAddr implements Stack.RemoveInterfaceAddr. +func (s *TestStack) RemoveInterfaceAddr(idx int32, addr InterfaceAddr) error { + interfaceAddrs, ok := s.InterfaceAddrsMap[idx] + if !ok { + return fmt.Errorf("unknown idx: %d", idx) + } + + var filteredAddrs []InterfaceAddr + for _, interfaceAddr := range interfaceAddrs { + if !bytes.Equal(interfaceAddr.Addr, addr.Addr) { + filteredAddrs = append(filteredAddrs, addr) + } + } + s.InterfaceAddrsMap[idx] = filteredAddrs + + return nil +} + // SupportsIPv6 implements Stack.SupportsIPv6. func (s *TestStack) SupportsIPv6() bool { return s.SupportsIPv6Flag @@ -91,6 +117,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { return nil } +// TCPRecovery implements Stack.TCPRecovery. +func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) { + return s.Recovery, nil +} + +// SetTCPRecovery implements Stack.SetTCPRecovery. +func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error { + s.Recovery = recovery + return nil +} + // Statistics implements inet.Stack.Statistics. func (s *TestStack) Statistics(stat interface{}, arg string) error { return nil @@ -116,3 +153,14 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint { // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} + +// Forwarding implements inet.Stack.Forwarding. +func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { + return s.IPForwarding +} + +// SetForwarding implements inet.Stack.SetForwarding. +func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { + s.IPForwarding = enable + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a28eab8b8..90dd4a047 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -69,8 +69,63 @@ go_template_instance( prefix = "socket", template = "//pkg/ilist:generic_list", types = { - "Element": "*SocketEntry", - "Linker": "*SocketEntry", + "Element": "*SocketRecordVFS1", + "Linker": "*SocketRecordVFS1", + }, +) + +go_template_instance( + name = "fd_table_refs", + out = "fd_table_refs.go", + package = "kernel", + prefix = "FDTable", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "FDTable", + }, +) + +go_template_instance( + name = "fs_context_refs", + out = "fs_context_refs.go", + package = "kernel", + prefix = "FSContext", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "FSContext", + }, +) + +go_template_instance( + name = "ipc_namespace_refs", + out = "ipc_namespace_refs.go", + package = "kernel", + prefix = "IPCNamespace", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "IPCNamespace", + }, +) + +go_template_instance( + name = "process_group_refs", + out = "process_group_refs.go", + package = "kernel", + prefix = "ProcessGroup", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "ProcessGroup", + }, +) + +go_template_instance( + name = "session_refs", + out = "session_refs.go", + package = "kernel", + prefix = "Session", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "Session", }, ) @@ -85,11 +140,17 @@ go_library( name = "kernel", srcs = [ "abstract_socket_namespace.go", + "aio.go", "context.go", "fd_table.go", + "fd_table_refs.go", "fd_table_unsafe.go", "fs_context.go", + "fs_context_refs.go", "ipc_namespace.go", + "ipc_namespace_refs.go", + "kcov.go", + "kcov_unsafe.go", "kernel.go", "kernel_opts.go", "kernel_state.go", @@ -98,6 +159,7 @@ go_library( "pending_signals_state.go", "posixtimer.go", "process_group_list.go", + "process_group_refs.go", "ptrace.go", "ptrace_amd64.go", "ptrace_arm64.go", @@ -105,6 +167,7 @@ go_library( "seccomp.go", "seqatomic_taskgoroutineschedinfo_unsafe.go", "session_list.go", + "session_refs.go", "sessions.go", "signal.go", "signal_handlers.go", @@ -131,6 +194,7 @@ go_library( "task_stop.go", "task_syscall.go", "task_usermem.go", + "task_work.go", "thread_group.go", "threads.go", "timekeeper.go", @@ -145,22 +209,27 @@ go_library( "gvisor.dev/gvisor/pkg/sentry/device", "gvisor.dev/gvisor/pkg/tcpip", ], + marshal = True, visibility = ["//:sandbox"], deps = [ ":uncaught_signal_go_proto", "//pkg/abi", "//pkg/abi/linux", "//pkg/amutex", - "//pkg/binary", "//pkg/bits", "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", + "//pkg/coverage", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/fspath", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", @@ -199,6 +268,7 @@ go_library( "//pkg/sentry/vfs", "//pkg/state", "//pkg/state/statefile", + "//pkg/state/wire", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", @@ -206,7 +276,6 @@ go_library( "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", - "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 920fe4329..0ddbe5ff6 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,28 +15,21 @@ package kernel import ( + "fmt" "syscall" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type abstractEndpoint struct { - ep transport.BoundEndpoint - wr *refs.WeakRef - name string - ns *AbstractSocketNamespace -} - -// WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone() { - e.ns.mu.Lock() - if e.ns.endpoints[e.name].ep == e.ep { - delete(e.ns.endpoints, e.name) - } - e.ns.mu.Unlock() + ep transport.BoundEndpoint + socket refsvfs2.RefCounter + name string + ns *AbstractSocketNamespace } // AbstractSocketNamespace is used to implement the Linux abstract socket functionality. @@ -45,7 +38,11 @@ func (e *abstractEndpoint) WeakRefGone() { type AbstractSocketNamespace struct { mu sync.Mutex `state:"nosave"` - // Keeps mapping from name to endpoint. + // Keeps a mapping from name to endpoint. AbstractSocketNamespace does not hold + // any references on any sockets that it contains; when retrieving a socket, + // TryIncRef() must be called in case the socket is concurrently being + // destroyed. It is the responsibility of the socket to remove itself from the + // abstract socket namespace when it is destroyed. endpoints map[string]abstractEndpoint } @@ -57,16 +54,16 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { } // A boundEndpoint wraps a transport.BoundEndpoint to maintain a reference on -// its backing object. +// its backing socket. type boundEndpoint struct { transport.BoundEndpoint - rc refs.RefCounter + socket refsvfs2.RefCounter } // Release implements transport.BoundEndpoint.Release. -func (e *boundEndpoint) Release() { - e.rc.DecRef() - e.BoundEndpoint.Release() +func (e *boundEndpoint) Release(ctx context.Context) { + e.socket.DecRef(ctx) + e.BoundEndpoint.Release(ctx) } // BoundEndpoint retrieves the endpoint bound to the given name. The return @@ -80,32 +77,59 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp return nil } - rc := ep.wr.Get() - if rc == nil { - delete(a.endpoints, name) + if !ep.socket.TryIncRef() { + // The socket has reached zero references and is being destroyed. return nil } - return &boundEndpoint{ep.ep, rc} + return &boundEndpoint{ep.ep, ep.socket} } // Bind binds the given socket. // -// When the last reference managed by rc is dropped, ep may be removed from the +// When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() + // Check if there is already a socket (which has not yet been destroyed) bound at name. if ep, ok := a.endpoints[name]; ok { - if rc := ep.wr.Get(); rc != nil { - rc.DecRef() + if ep.socket.TryIncRef() { + ep.socket.DecRef(ctx) return syscall.EADDRINUSE } } ae := abstractEndpoint{ep: ep, name: name, ns: a} - ae.wr = refs.NewWeakRef(rc, &ae) + ae.socket = socket a.endpoints[name] = ae return nil } + +// Remove removes the specified socket at name from the abstract socket +// namespace, if it has not yet been replaced. +func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { + a.mu.Lock() + defer a.mu.Unlock() + + ep, ok := a.endpoints[name] + if !ok { + // We never delete a map entry apart from a socket's destructor (although the + // map entry may be overwritten). Therefore, a socket should exist, even if it + // may not be the one we expect. + panic(fmt.Sprintf("expected socket to exist at '%s' in abstract socket namespace", name)) + } + + // A Bind() operation may race with callers of Remove(), e.g. in the + // following case: + // socket1 reaches zero references and begins destruction + // a.Bind("foo", ep, socket2) replaces socket1 with socket2 + // socket1's destructor calls a.Remove("foo", socket1) + // + // Therefore, we need to check that the socket at name is what we expect + // before modifying the map. + if ep.socket == socket { + delete(a.endpoints, name) + } +} diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go new file mode 100644 index 000000000..0ac78c0b8 --- /dev/null +++ b/pkg/sentry/kernel/aio.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" +) + +// AIOCallback is an function that does asynchronous I/O on behalf of a task. +type AIOCallback func(context.Context) + +// QueueAIO queues an AIOCallback which will be run asynchronously. +func (t *Task) QueueAIO(cb AIOCallback) { + ctx := taskAsyncContext{t: t} + wg := &t.TaskSet().aioGoroutines + wg.Add(1) + go func() { + cb(ctx) + wg.Done() + }() +} + +type taskAsyncContext struct { + context.NoopSleeper + t *Task +} + +// Debugf implements log.Logger.Debugf. +func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { + ctx.t.Debugf(format, v...) +} + +// Infof implements log.Logger.Infof. +func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { + ctx.t.Infof(format, v...) +} + +// Warningf implements log.Logger.Warningf. +func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { + ctx.t.Warningf(format, v...) +} + +// IsLogging implements log.Logger.IsLogging. +func (ctx taskAsyncContext) IsLogging(level log.Level) bool { + return ctx.t.IsLogging(level) +} + +// Deadline implements context.Context.Deadline. +func (ctx taskAsyncContext) Deadline() (time.Time, bool) { + return ctx.t.Deadline() +} + +// Done implements context.Context.Done. +func (ctx taskAsyncContext) Done() <-chan struct{} { + return ctx.t.Done() +} + +// Err implements context.Context.Err. +func (ctx taskAsyncContext) Err() error { + return ctx.t.Err() +} + +// Value implements context.Context.Value. +func (ctx taskAsyncContext) Value(key interface{}) interface{} { + return ctx.t.Value(key) +} diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 2bc49483a..869e49ebc 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -57,6 +57,7 @@ go_library( "id_map_set.go", "user_namespace.go", ], + marshal = True, visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index ef5723127..c08d47787 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -34,3 +34,23 @@ func CredentialsFromContext(ctx context.Context) *Credentials { } return NewAnonymousCredentials() } + +// ContextWithCredentials returns a copy of ctx carrying creds. +func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context { + return &authContext{ctx, creds} +} + +type authContext struct { + context.Context + creds *Credentials +} + +// Value implements context.Context. +func (ac *authContext) Value(key interface{}) interface{} { + switch key { + case CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index e057d2c6d..6862f2ef5 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { } return NoID, syserror.EPERM } + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/kernel/auth/id.go b/pkg/sentry/kernel/auth/id.go index 0a58ba17c..4c32ee703 100644 --- a/pkg/sentry/kernel/auth/id.go +++ b/pkg/sentry/kernel/auth/id.go @@ -19,9 +19,13 @@ import ( ) // UID is a user ID in an unspecified user namespace. +// +// +marshal type UID uint32 // GID is a group ID in an unspecified user namespace. +// +// +marshal slice:GIDSlice type GID uint32 // In the root user namespace, user/group IDs have a 1-to-1 relationship with diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 0c40bf315..bb94769c4 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -18,7 +18,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/log" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -82,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, -// or nil if there is no such IPC namespace. +// or nil if there is no such IPC namespace. It takes a reference on the +// namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(CtxIPCNamespace); v != nil { return v.(*IPCNamespace) @@ -113,55 +113,3 @@ func (*Task) Done() <-chan struct{} { func (*Task) Err() error { return nil } - -// AsyncContext returns a context.Context that may be used by goroutines that -// do work on behalf of t and therefore share its contextual values, but are -// not t's task goroutine (e.g. asynchronous I/O). -func (t *Task) AsyncContext() context.Context { - return taskAsyncContext{t: t} -} - -type taskAsyncContext struct { - context.NoopSleeper - t *Task -} - -// Debugf implements log.Logger.Debugf. -func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { - ctx.t.Debugf(format, v...) -} - -// Infof implements log.Logger.Infof. -func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { - ctx.t.Infof(format, v...) -} - -// Warningf implements log.Logger.Warningf. -func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { - ctx.t.Warningf(format, v...) -} - -// IsLogging implements log.Logger.IsLogging. -func (ctx taskAsyncContext) IsLogging(level log.Level) bool { - return ctx.t.IsLogging(level) -} - -// Deadline implements context.Context.Deadline. -func (ctx taskAsyncContext) Deadline() (time.Time, bool) { - return ctx.t.Deadline() -} - -// Done implements context.Context.Done. -func (ctx taskAsyncContext) Done() <-chan struct{} { - return ctx.t.Done() -} - -// Err implements context.Context.Err. -func (ctx taskAsyncContext) Err() error { - return ctx.t.Err() -} - -// Value implements context.Context.Value. -func (ctx taskAsyncContext) Value(key interface{}) interface{} { - return ctx.t.Value(key) -} diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 3d78cd48f..15519f0df 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -76,8 +76,8 @@ type pollEntry struct { // WeakRefGone implements refs.WeakRefUser.WeakRefGone. // weakReferenceGone is called when the file in the weak reference is destroyed. // The poll entry is removed in response to this. -func (p *pollEntry) WeakRefGone() { - p.epoll.RemoveEntry(p.id) +func (p *pollEntry) WeakRefGone(ctx context.Context) { + p.epoll.RemoveEntry(ctx, p.id) } // EventPoll holds all the state associated with an event poll object, that is, @@ -107,7 +107,7 @@ type EventPoll struct { // different lock to avoid circular lock acquisition order involving // the wait queue mutexes and mu. The full order is mu, observed file // wait queue mutex, then listsMu; this allows listsMu to be acquired - // when readyCallback is called. + // when (*pollEntry).Callback is called. // // An entry is always in one of the following lists: // readyList -- when there's a chance that it's ready to have @@ -116,7 +116,7 @@ type EventPoll struct { // readEvents() functions always call the entry's file // Readiness() function to confirm it's ready. // waitingList -- when there's no chance that the entry is ready, - // so it's waiting for the readyCallback to be called + // so it's waiting for the (*pollEntry).Callback to be called // on it before it gets moved to the readyList. // disabledList -- when the entry is disabled. This happens when // a one-shot entry gets delivered via readEvents(). @@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) } // Release implements fs.FileOperations.Release. -func (e *EventPoll) Release() { +func (e *EventPoll) Release(ctx context.Context) { // We need to take the lock now because files may be attempting to // remove entries in parallel if they get destroyed. e.mu.Lock() @@ -160,7 +160,7 @@ func (e *EventPoll) Release() { // Go through all entries and clean up. for _, entry := range e.files { entry.id.File.EventUnregister(&entry.waiter) - entry.file.Drop() + entry.file.Drop(ctx) } e.files = nil } @@ -269,21 +269,19 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent { return ret } -// readyCallback is called when one of the files we're polling becomes ready. It -// moves said file to the readyList if it's currently in the waiting list. -type readyCallback struct{} - // Callback implements waiter.EntryCallback.Callback. -func (*readyCallback) Callback(w *waiter.Entry) { - entry := w.Context.(*pollEntry) - e := entry.epoll +// +// Callback is called when one of the files we're polling becomes ready. It +// moves said file to the readyList if it's currently in the waiting list. +func (p *pollEntry) Callback(*waiter.Entry) { + e := p.epoll e.listsMu.Lock() - if entry.curList == &e.waitingList { - e.waitingList.Remove(entry) - e.readyList.PushBack(entry) - entry.curList = &e.readyList + if p.curList == &e.waitingList { + e.waitingList.Remove(p) + e.readyList.PushBack(p) + p.curList = &e.readyList e.listsMu.Unlock() e.Notify(waiter.EventIn) @@ -310,7 +308,7 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) { // Check if the file happens to already be in a ready state. ready := f.Readiness(entry.mask) & entry.mask if ready != 0 { - (*readyCallback).Callback(nil, &entry.waiter) + entry.Callback(&entry.waiter) } } @@ -380,10 +378,9 @@ func (e *EventPoll) AddEntry(id FileIdentifier, flags EntryFlags, mask waiter.Ev userData: data, epoll: e, flags: flags, - waiter: waiter.Entry{Callback: &readyCallback{}}, mask: mask, } - entry.waiter.Context = entry + entry.waiter.Callback = entry e.files[id] = entry entry.file = refs.NewWeakRef(id.File, entry) @@ -406,7 +403,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // Unregister the old mask and remove entry from the list it's in, so - // readyCallback is guaranteed to not be called on this entry anymore. + // (*pollEntry).Callback is guaranteed to not be called on this entry anymore. entry.id.File.EventUnregister(&entry.waiter) // Remove entry from whatever list it's in. This ensure that no other @@ -426,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // RemoveEntry a files from the collection of observed files. -func (e *EventPoll) RemoveEntry(id FileIdentifier) error { +func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { e.mu.Lock() defer e.mu.Unlock() @@ -448,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error { // Remove file from map, and drop weak reference. delete(e.files, id) - entry.file.Drop() + entry.file.Drop(ctx) return nil } diff --git a/pkg/sentry/kernel/epoll/epoll_state.go b/pkg/sentry/kernel/epoll/epoll_state.go index 8e9f200d0..7c61e0258 100644 --- a/pkg/sentry/kernel/epoll/epoll_state.go +++ b/pkg/sentry/kernel/epoll/epoll_state.go @@ -21,8 +21,7 @@ import ( // afterLoad is invoked by stateify. func (p *pollEntry) afterLoad() { - p.waiter = waiter.Entry{Callback: &readyCallback{}} - p.waiter.Context = p + p.waiter.Callback = p p.file = refs.NewWeakRef(p.id.File, p) p.id.File.EventRegister(&p.waiter, p.mask) } diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 22630e9c5..55b505593 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) { f := filetest.NewTestFile(t) id := FileIdentifier{f, 12} - efile := NewEventPoll(contexttest.Context(t)) + ctx := contexttest.Context(t) + efile := NewEventPoll(ctx) e := efile.FileOperations.(*EventPoll) if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { t.Fatalf("addEntry failed: %v", err) @@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) { } // Destroy the file. Check that we get no more events. - f.DecRef() + f.DecRef(ctx) evt = e.ReadEvents(1) if len(evt) != 0 { diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 87951adeb..bbf568dfc 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, @@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) { } // Release implements fs.FileOperations.Release. -func (e *EventOperations) Release() { +func (e *EventOperations) Release(context.Context) { e.mu.Lock() defer e.mu.Unlock() if e.hostfd >= 0 { diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index b9126e946..2b3955598 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -11,6 +11,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index d32c3e90a..153d2cd9b 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -20,15 +20,21 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) -// New creates a new FileAsync. +// New creates a new fs.FileAsync. func New() fs.FileAsync { return &FileAsync{} } +// NewVFS2 creates a new vfs.FileAsync. +func NewVFS2() vfs.FileAsync { + return &FileAsync{} +} + // FileAsync sends signals when the registered file is ready for IO. // // +stateify savable @@ -170,3 +176,13 @@ func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kern a.recipientTG = nil a.recipientPG = recipient } + +// ClearOwner unsets the current signal recipient. +func (a *FileAsync) ClearOwner() { + a.mu.Lock() + defer a.mu.Unlock() + a.requester = nil + a.recipientT = nil + a.recipientTG = nil + a.recipientPG = nil +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index dbfcef0fa..7aba31587 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -23,12 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FDFlags define flags for an individual descriptor. @@ -77,11 +77,9 @@ type descriptor struct { // // +stateify savable type FDTable struct { - refs.AtomicRefCount - k *Kernel + FDTableRefs - // uid is a unique identifier. - uid uint64 + k *Kernel // mu protects below. mu sync.Mutex `state:"nosave"` @@ -100,7 +98,7 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, fileVFS2: fileVFS2, @@ -111,26 +109,30 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { } func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { - f.init() // Initialize table. + ctx := context.Background() + f.initNoLeakCheck() // Initialize table. + f.used = 0 for fd, d := range m { - f.setAll(fd, d.file, d.fileVFS2, d.flags) + if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { + panic("VFS1 or VFS2 files set") + } // Note that we do _not_ need to acquire a extra table reference here. The // table reference will already be accounted for in the file, so we drop the // reference taken by set above. switch { case d.file != nil: - d.file.DecRef() + d.file.DecRef(ctx) case d.fileVFS2 != nil: - d.fileVFS2.DecRef() + d.fileVFS2.DecRef(ctx) } } } // drop drops the table reference. -func (f *FDTable) drop(file *fs.File) { +func (f *FDTable) drop(ctx context.Context, file *fs.File) { // Release locks. - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) // Send inotify events. d := file.Dirent @@ -146,61 +148,51 @@ func (f *FDTable) drop(file *fs.File) { d.InotifyEvent(ev, 0) // Drop the table reference. - file.DecRef() + file.DecRef(ctx) } // dropVFS2 drops the table reference. -func (f *FDTable) dropVFS2(file *vfs.FileDescription) { - // TODO(gvisor.dev/issue/1480): Release locks. +func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { + // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } // Generate inotify events. ev := uint32(linux.IN_CLOSE_NOWRITE) if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) - // Drop the table reference. - file.DecRef() -} - -// ID returns a unique identifier for this FDTable. -func (f *FDTable) ID() uint64 { - return f.uid + // Drop the table's reference. + file.DecRef(ctx) } // NewFDTable allocates a new FDTable that may be used by tasks in k. func (k *Kernel) NewFDTable() *FDTable { - f := &FDTable{ - k: k, - uid: atomic.AddUint64(&k.fdMapUids, 1), - } + f := &FDTable{k: k} f.init() return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. +func (f *FDTable) DecRef(ctx context.Context) { + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) }) } -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDTable) DecRef() { - f.DecRefWithDestructor(f.destroy) -} - -// Size returns the number of file descriptor slots currently allocated. -func (f *FDTable) Size() int { - size := atomic.LoadInt32(&f.used) - return int(size) -} - // forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { +func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 fd := int32(0) @@ -219,7 +211,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, file, nil, flags) - file.DecRef() + file.DecRef(ctx) case fileVFS2 != nil: if !fileVFS2.TryIncRef() { retries++ @@ -229,7 +221,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, nil, fileVFS2, flags) - fileVFS2.DecRef() + fileVFS2.DecRef(ctx) } retries = 0 fd++ @@ -239,7 +231,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. func (f *FDTable) String() string { var buf strings.Builder - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + ctx := context.Background() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) @@ -247,7 +240,11 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() - name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + vd := fileVFS2.VirtualDentry() + if vd.Dentry() == nil { + panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2)) + } + name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "<err: %v>\n", err) return @@ -282,7 +279,6 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -292,15 +288,25 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.get(i); d == nil { - f.set(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.set(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.set(i, nil, FDFlags{}) // Zap entry. + f.set(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.set() that + // originally installed the file. Don't call f.drop() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. + for _, file := range files[:len(fds)] { + file.DecRef(ctx) } return nil, syscall.EMFILE } @@ -310,6 +316,7 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags f.next = fds[len(fds)-1] + 1 } + f.mu.Unlock() return fds, nil } @@ -337,7 +344,6 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes } f.mu.Lock() - defer f.mu.Unlock() // From f.next to find available fd. if fd < f.next { @@ -347,15 +353,25 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes // Install all entries. for i := fd; i < end && len(fds) < len(files); i++ { if d, _, _ := f.getVFS2(i); d == nil { - f.setVFS2(i, files[len(fds)], flags) // Set the descriptor. - fds = append(fds, i) // Record the file descriptor. + // Set the descriptor. + f.setVFS2(ctx, i, files[len(fds)], flags) + fds = append(fds, i) // Record the file descriptor. } } // Failure? Unwind existing FDs. if len(fds) < len(files) { for _, i := range fds { - f.setVFS2(i, nil, FDFlags{}) // Zap entry. + f.setVFS2(ctx, i, nil, FDFlags{}) + } + f.mu.Unlock() + + // Drop the reference taken by the call to f.setVFS2() that + // originally installed the file. Don't call f.dropVFS2() + // (generating inotify events, etc.) since the file should + // appear to have never been inserted into f. + for _, file := range files[:len(fds)] { + file.DecRef(ctx) } return nil, syscall.EMFILE } @@ -365,6 +381,7 @@ func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDes f.next = fds[len(fds)-1] + 1 } + f.mu.Unlock() return fds, nil } @@ -400,7 +417,7 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc } for fd < end { if d, _, _ := f.getVFS2(fd); d == nil { - f.setVFS2(fd, file, flags) + f.setVFS2(ctx, fd, file, flags) if fd == f.next { // Update next search start position. f.next = fd + 1 @@ -416,40 +433,55 @@ func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDesc // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { - return f.newFDAt(ctx, fd, file, nil, flags) + df, _, err := f.newFDAt(ctx, fd, file, nil, flags) + if err != nil { + return err + } + if df != nil { + f.drop(ctx, df) + } + return nil } // NewFDAtVFS2 sets the file reference for the given FD. If there is an active // reference for that FD, the ref count for that existing reference is // decremented. func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { - return f.newFDAt(ctx, fd, nil, file, flags) + _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags) + if err != nil { + return err + } + if dfVFS2 != nil { + f.dropVFS2(ctx, dfVFS2) + } + return nil } -func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error { +func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) { if fd < 0 { // Don't accept negative FDs. - return syscall.EBADF + return nil, nil, syscall.EBADF } // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { - return syscall.EMFILE + return nil, nil, syscall.EMFILE } } // Install the entry. f.mu.Lock() defer f.mu.Unlock() - f.setAll(fd, file, fileVFS2, flags) - return nil + + df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags) + return df, dfVFS2, nil } // SetFlags sets the flags for the given file descriptor. // // True is returned iff flags were changed. -func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { +func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error { if fd < 0 { // Don't accept negative FDs. return syscall.EBADF @@ -465,7 +497,30 @@ func (f *FDTable) SetFlags(fd int32, flags FDFlags) error { } // Update the flags. - f.set(fd, file, flags) + f.set(ctx, fd, file, flags) + return nil +} + +// SetFlagsVFS2 sets the flags for the given file descriptor. +// +// True is returned iff flags were changed. +func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error { + if fd < 0 { + // Don't accept negative FDs. + return syscall.EBADF + } + + f.mu.Lock() + defer f.mu.Unlock() + + file, _, _ := f.getVFS2(fd) + if file == nil { + // No file found. + return syscall.EBADF + } + + // Update the flags. + f.setVFS2(ctx, fd, file, flags) return nil } @@ -523,50 +578,23 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. -func (f *FDTable) GetFDs() []int32 { +func (f *FDTable) GetFDs(ctx context.Context) []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds } -// GetRefs returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDTable) GetRefs() []*fs.File { - files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - file.IncRef() // Acquire a reference for caller. - files = append(files, file) - }) - return files -} - -// GetRefsVFS2 returns a stable slice of references to all files and bumps the -// reference count on each. The caller must use DecRef on each reference when -// they're done using the slice. -func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { - files := make([]*vfs.FileDescription, 0, f.Size()) - f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { - file.IncRef() // Acquire a reference for caller. - files = append(files, file) - }) - return files -} - // Fork returns an independent FDTable. -func (f *FDTable) Fork() *FDTable { +func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. - switch { - case file != nil: - clone.set(fd, file, flags) - case fileVFS2 != nil: - clone.setVFS2(fd, fileVFS2, flags) + if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil { + panic("VFS1 or VFS2 files set") } }) return clone @@ -575,13 +603,12 @@ func (f *FDTable) Fork() *FDTable { // Remove removes an FD from and returns a non-file iff successful. // // N.B. Callers are required to use DecRef when they are done. -func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { +func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) { if fd < 0 { return nil, nil } f.mu.Lock() - defer f.mu.Unlock() // Update current available position. if fd < f.next { @@ -597,24 +624,51 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { case orig2 != nil: orig2.IncRef() } + if orig != nil || orig2 != nil { - f.setAll(fd, nil, nil, FDFlags{}) // Zap entry. + orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry. } + f.mu.Unlock() + + if orig != nil { + f.drop(ctx, orig) + } + if orig2 != nil { + f.dropVFS2(ctx, orig2) + } + return orig, orig2 } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { - f.mu.Lock() - defer f.mu.Unlock() +func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { + // TODO(gvisor.dev/issue/1624): Remove fs.File slice. + var files []*fs.File + var filesVFS2 []*vfs.FileDescription - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.mu.Lock() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { - f.set(fd, nil, FDFlags{}) // Clear from table. + df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table. + if df != nil { + files = append(files, df) + } + if dfVFS2 != nil { + filesVFS2 = append(filesVFS2, dfVFS2) + } // Update current available position. if fd < f.next { f.next = fd } } }) + f.mu.Unlock() + + for _, file := range files { + f.drop(ctx, file) + } + + for _, file := range filesVFS2 { + f.dropVFS2(ctx, file) + } } diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 29f95a2c4..bf5460083 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -72,7 +72,7 @@ func TestFDTableMany(t *testing.T) { } i := int32(2) - fdTable.Remove(i) + fdTable.Remove(ctx, i) if fds, err := fdTable.NewFDs(ctx, 0, []*fs.File{file}, FDFlags{}); err != nil || fds[0] != i { t.Fatalf("Allocated %v FDs but wanted to allocate %v: %v", i, maxFD, err) } @@ -93,7 +93,7 @@ func TestFDTableOverLimit(t *testing.T) { t.Fatalf("fdTable.NewFDs(maxFD-3, {f,f,f}): got %v, wanted nil", err) } else { for _, fd := range fds { - fdTable.Remove(fd) + fdTable.Remove(ctx, fd) } } @@ -150,13 +150,13 @@ func TestFDTable(t *testing.T) { t.Fatalf("fdTable.Get(2): got a %v, wanted nil", ref) } - ref, _ := fdTable.Remove(1) + ref, _ := fdTable.Remove(ctx, 1) if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } - ref.DecRef() + ref.DecRef(ctx) - if ref, _ := fdTable.Remove(1); ref != nil { + if ref, _ := fdTable.Remove(ctx, 1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") } }) @@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) { b.StartTimer() // Benchmark. for i := 0; i < b.N; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }) } @@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { defer wg.Done() for i := 0; i < each; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }() } diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 7fd97dc53..3476551f3 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -30,12 +31,21 @@ type descriptorTable struct { slice unsafe.Pointer `state:".(map[int32]*descriptor)"` } -// init initializes the table. -func (f *FDTable) init() { +// initNoLeakCheck initializes the table without enabling leak checking. +// +// This is used when loading an FDTable after S/R, during which the ref count +// object itself will enable leak checking if necessary. +func (f *FDTable) initNoLeakCheck() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) } +// init initializes the table with leak checking. +func (f *FDTable) init() { + f.initNoLeakCheck() + f.EnableLeakCheck() +} + // get gets a file entry. // // The boolean indicates whether this was in range. @@ -76,33 +86,37 @@ func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, boo return d.file, d.fileVFS2, d.flags, true } -// set sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// CurrentMaxFDs returns the number of file descriptors that may be stored in f +// without reallocation. +func (f *FDTable) CurrentMaxFDs() int { + slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) + return len(slice) +} + +// set sets an entry for VFS1, refer to setAll(). // // Precondition: mu must be held. -func (f *FDTable) set(fd int32, file *fs.File, flags FDFlags) { - f.setAll(fd, file, nil, flags) +func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File { + dropFile, _ := f.setAll(ctx, fd, file, nil, flags) + return dropFile } -// setVFS2 sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// setVFS2 sets an entry for VFS2, refer to setAll(). // // Precondition: mu must be held. -func (f *FDTable) setVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) { - f.setAll(fd, nil, file, flags) +func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription { + _, dropFile := f.setAll(ctx, fd, nil, file, flags) + return dropFile } -// setAll sets an entry. -// -// This handles accounting changes, as well as acquiring and releasing the -// reference needed by the table iff the file is different. +// setAll sets the file description referred to by fd to file/fileVFS2. If +// file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces +// an existing file description, it returns it with the FDTable's reference +// transferred to the caller, which must call f.drop/dropVFS2() on the returned +// file after unlocking f.mu. // // Precondition: mu must be held. -func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { +func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) { if file != nil && fileVFS2 != nil { panic("VFS1 and VFS2 files set") } @@ -145,25 +159,25 @@ func (f *FDTable) setAll(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, } } - // Drop the table reference. + // Adjust used. + switch { + case orig == nil && desc != nil: + atomic.AddInt32(&f.used, 1) + case orig != nil && desc == nil: + atomic.AddInt32(&f.used, -1) + } + if orig != nil { switch { case orig.file != nil: if desc == nil || desc.file != orig.file { - f.drop(orig.file) + return orig.file, nil } case orig.fileVFS2 != nil: if desc == nil || desc.fileVFS2 != orig.fileVFS2 { - f.dropVFS2(orig.fileVFS2) + return nil, orig.fileVFS2 } } } - - // Adjust used. - switch { - case orig == nil && desc != nil: - atomic.AddInt32(&f.used, 1) - case orig != nil && desc == nil: - atomic.AddInt32(&f.used, -1) - } + return nil, nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 47f78df9a..41fb2a784 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -17,7 +17,7 @@ package kernel import ( "fmt" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -29,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -76,96 +76,104 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy() { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). - f.mu.Lock() - defer f.mu.Unlock() - - if VFS2Enabled { - f.rootVFS2.DecRef() - f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef() - f.cwdVFS2 = vfs.VirtualDentry{} - } else { - f.root.DecRef() - f.root = nil - f.cwd.DecRef() - f.cwd = nil - } -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FSContext) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FSContext) DecRef(ctx context.Context) { + f.FSContextRefs.DecRef(func() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef(ctx) + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef(ctx) + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef(ctx) + f.root = nil + f.cwd.DecRef(ctx) + f.cwd = nil + } + }) } // Fork forks this FSContext. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { + if !f.cwdVFS2.Ok() { + panic("FSContext.Fork() called after destroy") + } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { + if f.cwd == nil { + panic("FSContext.Fork() called after destroy") + } f.cwd.IncRef() f.root.IncRef() } - return &FSContext{ + ctx := &FSContext{ cwd: f.cwd, root: f.root, cwdVFS2: f.cwdVFS2, rootVFS2: f.rootVFS2, umask: f.umask, } + ctx.EnableLeakCheck() + return ctx } // WorkingDirectory returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() + if f.cwd != nil { + f.cwd.IncRef() + } return f.cwd } // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.cwdVFS2.IncRef() + if f.cwdVFS2.Ok() { + f.cwdVFS2.IncRef() + } return f.cwdVFS2 } // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } @@ -180,27 +188,31 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old := f.cwd f.cwd = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // -// This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() + if !f.cwdVFS2.Ok() { + panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d)) + } + old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // RootDirectory returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -212,21 +224,23 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return an empty vfs.VirtualDentry if called after f is +// destroyed, otherwise it will return a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() - f.rootVFS2.IncRef() + if f.rootVFS2.Ok() { + f.rootVFS2.IncRef() + } return f.rootVFS2 } // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. -func (f *FSContext) SetRootDirectory(d *fs.Dirent) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } @@ -241,13 +255,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old := f.root f.root = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // -// This is not a valid call after free. -func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { +// This is not a valid call after f is destroyed. +func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } @@ -263,7 +277,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() - old.DecRef() + old.DecRef(ctx) } // Umask returns the current umask. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index c5021f2db..daa2dae76 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -51,6 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ + "//pkg/context", "//pkg/sync", "//pkg/usermem", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 732e66da4..e4dcc4d40 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,6 +19,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -66,9 +67,9 @@ type Key struct { Offset uint64 } -func (k *Key) release() { +func (k *Key) release(t Target) { if k.MappingIdentity != nil { - k.MappingIdentity.DecRef() + k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil @@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. type Target interface { + context.Context + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) @@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) { // bucket "to". // // Preconditions: b and to must be locked. -func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { +func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { @@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - requeued.key.release() + requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) @@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() - k.release() + k.release(t) return r, nil } @@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. - b1.requeueLocked(b2, &k1, &k2, nreq) + b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } @@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() - w.key.release() + w.key.release(t) return err } @@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. -func (m *Manager) WaitComplete(w *Waiter) { +func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() @@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) { } // Release references held by the waiter. - w.key.release() + w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex @@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { - w.key.release() + w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. - w.key.release() + w.key.release(t) } b.mu.Unlock() return success, nil @@ -717,10 +720,10 @@ func (m *Manager) lockPILocked(w *Waiter, t Target, addr usermem.Addr, tid uint3 } } -// UnlockPI unlock the futex following the Priority-inheritance futex -// rules. The address provided must contain the caller's TID. If there are -// waiters, TID of the next waiter (FIFO) is set to the given address, and the -// waiter woken up. If there are no waiters, 0 is set to the address. +// UnlockPI unlocks the futex following the Priority-inheritance futex rules. +// The address provided must contain the caller's TID. If there are waiters, +// TID of the next waiter (FIFO) is set to the given address, and the waiter +// woken up. If there are no waiters, 0 is set to the address. func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool) error { k, err := getKey(t, addr, private) if err != nil { @@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool err = m.unlockPILocked(t, addr, tid, b, &k) - k.release() + k.release(t) b.mu.Unlock() return err } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7c5c7665b..d0128c548 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,6 +22,7 @@ import ( "testing" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -29,28 +30,33 @@ import ( // testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. -type testData []byte +type testData struct { + context.Context + data []byte +} const sizeofInt32 = 4 func newTestData(size uint) testData { - return make([]byte, size) + return testData{ + data: make([]byte, size), + } } func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { - val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { - if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { @@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup. if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { @@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup using the wrong bitmask. if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { @@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) { var ws [3]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform two wakeups. @@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) { // Start two waiters waiting for wakeup on different addresses. w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform two wakeups on the second address. if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { @@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address 0. if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { @@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(0), which should succeed). @@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(1), which should fail). @@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(0), which @@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(1), which @@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -472,7 +478,7 @@ func (t *testMutex) Lock() { for { // Attempt to grab the lock. if atomic.CompareAndSwapUint32( - (*uint32)(unsafe.Pointer(&t.d[t.a])), + (*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked, testMutexLocked) { // Lock held. @@ -490,7 +496,7 @@ func (t *testMutex) Lock() { panic("WaitPrepare returned unexpected error: " + err.Error()) } <-w.C - t.m.WaitComplete(w) + t.m.WaitComplete(w, t.d) } } @@ -498,7 +504,7 @@ func (t *testMutex) Lock() { // This will notify any waiters via the futex manager. func (t *testMutex) Unlock() { // Unlock. - atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked) // Notify all waiters. t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..b87e40dd1 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refsvfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/kcov.go b/pkg/sentry/kernel/kcov.go new file mode 100644 index 000000000..4fcdfc541 --- /dev/null +++ b/pkg/sentry/kernel/kcov.go @@ -0,0 +1,338 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/coverage" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov +// area. On Linux, the maximum is INT_MAX / 8. +const kcovAreaSizeMax = 10 * 1024 * 1024 + +// Kcov provides kernel coverage data to userspace through a memory-mapped +// region, as kcov does in Linux. +// +// To give the illusion that the data is always up to date, we update the shared +// memory every time before we return to userspace. +type Kcov struct { + // mfp provides application memory. It is immutable after creation. + mfp pgalloc.MemoryFileProvider + + // mu protects all of the fields below. + mu sync.RWMutex + + // mode is the current kcov mode. + mode uint8 + + // size is the size of the mapping through which the kernel conveys coverage + // information to userspace. + size uint64 + + // owningTask is the task that currently owns coverage data on the system. The + // interface for kcov essentially requires that coverage is only going to a + // single task. Note that kcov should only generate coverage data for the + // owning task, but we currently generate global coverage. + owningTask *Task + + // count is a locally cached version of the first uint64 in the kcov data, + // which is the number of subsequent entries representing PCs. + // + // It is used with kcovInode.countBlock(), to copy in/out the first element of + // the actual data in an efficient manner, avoid boilerplate, and prevent + // accidental garbage escapes by the temporary counts. + count uint64 + + mappable *mm.SpecialMappable +} + +// NewKcov creates and returns a Kcov instance. +func (k *Kernel) NewKcov() *Kcov { + return &Kcov{ + mfp: k, + } +} + +var coveragePool = sync.Pool{ + New: func() interface{} { + return make([]byte, 0) + }, +} + +// TaskWork implements TaskWorker.TaskWork. +func (kcov *Kcov) TaskWork(t *Task) { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + if kcov.mode != linux.KCOV_MODE_TRACE_PC { + return + } + + rw := &kcovReadWriter{ + mf: kcov.mfp.MemoryFile(), + fr: kcov.mappable.FileRange(), + } + + // Read in the PC count. + if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil { + panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err)) + } + + rw.off = 8 * (1 + kcov.count) + n := coverage.ConsumeCoverageData(&kcovIOWriter{rw}) + + // Update the pc count, based on the number of entries written. Note that if + // we reached the end of the kcov area, we may not have written everything in + // output. + kcov.count += uint64(n / 8) + rw.off = 0 + if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil { + panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err)) + } + + // Re-register for future work. + t.RegisterWork(kcov) +} + +// InitTrace performs the KCOV_INIT_TRACE ioctl. +func (kcov *Kcov) InitTrace(size uint64) error { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + if kcov.mode != linux.KCOV_MODE_DISABLED { + return syserror.EBUSY + } + + // To simplify all the logic around mapping, we require that the length of the + // shared region is a multiple of the system page size. + if (8*size)&(usermem.PageSize-1) != 0 { + return syserror.EINVAL + } + + // We need space for at least two uint64s to hold current position and a + // single PC. + if size < 2 || size > kcovAreaSizeMax { + return syserror.EINVAL + } + + kcov.size = size + kcov.mode = linux.KCOV_MODE_INIT + return nil +} + +// EnableTrace performs the KCOV_ENABLE_TRACE ioctl. +func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { + t := TaskFromContext(ctx) + if t == nil { + panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") + } + + kcov.mu.Lock() + defer kcov.mu.Unlock() + + // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. + if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { + return syserror.EINVAL + } + + switch traceKind { + case linux.KCOV_TRACE_PC: + kcov.mode = linux.KCOV_MODE_TRACE_PC + case linux.KCOV_TRACE_CMP: + // We do not support KCOV_MODE_TRACE_CMP. + return syserror.ENOTSUP + default: + return syserror.EINVAL + } + + if kcov.owningTask != nil && kcov.owningTask != t { + return syserror.EBUSY + } + + kcov.owningTask = t + t.SetKcov(kcov) + t.RegisterWork(kcov) + + // Clear existing coverage data; the task expects to read only coverage data + // from the time it is activated. + coverage.ClearCoverageData() + return nil +} + +// DisableTrace performs the KCOV_DISABLE_TRACE ioctl. +func (kcov *Kcov) DisableTrace(ctx context.Context) error { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + t := TaskFromContext(ctx) + if t == nil { + panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") + } + + if t != kcov.owningTask { + return syserror.EINVAL + } + kcov.mode = linux.KCOV_MODE_INIT + kcov.owningTask = nil + if kcov.mappable != nil { + kcov.mappable.DecRef(ctx) + kcov.mappable = nil + } + return nil +} + +// Clear resets the mode and clears the owning task and memory mapping for kcov. +// It is called when the fd corresponding to kcov is closed. Note that the mode +// needs to be set so that the next call to kcov.TaskWork() will exit early. +func (kcov *Kcov) Clear(ctx context.Context) { + kcov.mu.Lock() + kcov.mode = linux.KCOV_MODE_INIT + kcov.owningTask = nil + if kcov.mappable != nil { + kcov.mappable.DecRef(ctx) + kcov.mappable = nil + } + kcov.mu.Unlock() +} + +// OnTaskExit is called when the owning task exits. It is similar to +// kcov.Clear(), except the memory mapping is not cleared, so that the same +// mapping can be used in the future if kcov is enabled again by another task. +func (kcov *Kcov) OnTaskExit() { + kcov.mu.Lock() + kcov.mode = linux.KCOV_MODE_INIT + kcov.owningTask = nil + kcov.mu.Unlock() +} + +// ConfigureMMap is called by the vfs.FileDescription for this kcov instance to +// implement vfs.FileDescription.ConfigureMMap. +func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + kcov.mu.Lock() + defer kcov.mu.Unlock() + + if kcov.mode != linux.KCOV_MODE_INIT { + return syserror.EINVAL + } + + if kcov.mappable == nil { + // Set up the kcov area. + fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous) + if err != nil { + return err + } + + // Get the thread id for the mmap name. + t := TaskFromContext(ctx) + if t == nil { + panic("ThreadFromContext returned nil") + } + // For convenience, a special mappable is used here. Note that these mappings + // will look different under /proc/[pid]/maps than they do on Linux. + kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr) + } + kcov.mappable.IncRef() + opts.Mappable = kcov.mappable + opts.MappingIdentity = kcov.mappable + return nil +} + +// kcovReadWriter implements safemem.Reader and safemem.Writer. +type kcovReadWriter struct { + off uint64 + mf *pgalloc.MemoryFile + fr memmap.FileRange +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + + // Limit the read to the kcov range and check for overflow. + if rw.fr.Length() <= rw.off { + return 0, io.EOF + } + start := rw.fr.Start + rw.off + end := rw.fr.Start + rw.fr.Length() + if rend := start + dsts.NumBytes(); rend < end { + end = rend + } + + // Get internal mappings. + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Read) + if err != nil { + return 0, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, bs) + rw.off += n + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + + // Limit the write to the kcov area and check for overflow. + if rw.fr.Length() <= rw.off { + return 0, io.EOF + } + start := rw.fr.Start + rw.off + end := rw.fr.Start + rw.fr.Length() + if wend := start + srcs.NumBytes(); wend < end { + end = wend + } + + // Get internal mapping. + bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, usermem.Write) + if err != nil { + return 0, err + } + + // Copy to internal mapping. + n, err := safemem.CopySeq(bs, srcs) + rw.off += n + return n, err +} + +// kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter. +type kcovIOWriter struct { + rw *kcovReadWriter +} + +// Write implements io.Writer.Write. +func (w *kcovIOWriter) Write(p []byte) (int, error) { + bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p)) + n, err := safemem.WriteFullFromBlocks(w.rw, bs) + return int(n), err +} diff --git a/pkg/sentry/kernel/kcov_unsafe.go b/pkg/sentry/kernel/kcov_unsafe.go new file mode 100644 index 000000000..6f8a0266b --- /dev/null +++ b/pkg/sentry/kernel/kcov_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "unsafe" + + "gvisor.dev/gvisor/pkg/safemem" +) + +// countBlock provides a safemem.BlockSeq for kcov.count. +// +// Like k.count, the block returned is protected by k.mu. +func (kcov *Kcov) countBlock() safemem.BlockSeq { + return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count)))) +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5efeb3767..9b2be44d4 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -34,12 +34,12 @@ package kernel import ( "errors" "fmt" - "io" "path/filepath" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" @@ -73,6 +73,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) @@ -81,6 +82,10 @@ import ( // easy access everywhere. To be removed once VFS2 becomes the default. var VFS2Enabled = false +// FUSEEnabled is set to true when FUSE is enabled. Added as a global for allow +// easy access everywhere. To be removed once FUSE is completed. +var FUSEEnabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // @@ -194,11 +199,6 @@ type Kernel struct { // cpuClockTickerSetting is protected by runningTasksMu. cpuClockTickerSetting ktime.Setting - // fdMapUids is an ever-increasing counter for generating FDTable uids. - // - // fdMapUids is mutable, and is accessed using atomic memory operations. - fdMapUids uint64 - // uniqueID is used to generate unique identifiers. // // uniqueID is mutable, and is accessed using atomic memory operations. @@ -221,13 +221,18 @@ type Kernel struct { // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` - // sockets is the list of all network sockets the system. Protected by - // extMu. + // sockets is the list of all network sockets in the system. + // Protected by extMu. + // TODO(gvisor.dev/issue/1624): Only used by VFS1. sockets socketList - // nextSocketEntry is the next entry number to use in sockets. Protected + // socketsVFS2 records all network sockets in the system. Protected by + // extMu. + socketsVFS2 map[*vfs.FileDescription]*SocketRecord + + // nextSocketRecord is the next entry number to use in sockets. Protected // by extMu. - nextSocketEntry uint64 + nextSocketRecord uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -249,7 +254,7 @@ type Kernel struct { // SpecialOpts contains special kernel options. SpecialOpts - // VFS keeps the filesystem state used across the kernel. + // vfs keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem // hostMount is the Mount used for file descriptors that were imported @@ -336,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("Timekeeper is nil") } if args.Timekeeper.clocks == nil { - return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") @@ -361,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { - return fmt.Errorf("Failed to get maximum CPU number: %v", err) + return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { @@ -377,7 +382,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.netlinkPorts = port.New() if VFS2Enabled { - if err := k.vfs.Init(); err != nil { + ctx := k.SupervisorContext() + if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } @@ -385,19 +391,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } - defer pipeFilesystem.DecRef() + defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount - tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } - defer tmpfsFilesystem.DecRef() - defer tmpfsRoot.DecRef() + defer tmpfsFilesystem.DecRef(ctx) + defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) @@ -408,12 +414,14 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } - defer socketFilesystem.DecRef() + defer socketFilesystem.DecRef(ctx) socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount + + k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) } return nil @@ -422,56 +430,70 @@ func (k *Kernel) Init(args InitKernelArgs) error { // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. -func (k *Kernel) SaveTo(w io.Writer) error { +func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { saveStart := time.Now() - ctx := k.SupervisorContext() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() defer k.extMu.Unlock() // Stop time. - k.pauseTimeLocked() - defer k.resumeTimeLocked() + k.pauseTimeLocked(ctx) + defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() k.mf.WaitForEvictions() - // Flush write operations on open files so data reaches backing storage. - // This must come after MemoryFile eviction since eviction may cause file - // writes. - if err := k.tasks.flushWritesToFiles(ctx); err != nil { - return err - } + if VFS2Enabled { + // Discard unsavable mappings, such as those for host file descriptors. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } + + // Prepare filesystems for saving. This must be done after + // invalidateUnsavableMappings(), since dropping memory mappings may + // affect filesystem state (e.g. page cache reference counts). + if err := k.vfs.PrepareSave(ctx); err != nil { + return err + } + } else { + // Flush cached file writes to backing storage. This must come after + // MemoryFile eviction since eviction may cause file writes. + if err := k.flushWritesToFiles(ctx); err != nil { + return err + } - // Remove all epoll waiter objects from underlying wait queues. - // NOTE: for programs to resume execution in future snapshot scenarios, - // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters() + // Remove all epoll waiter objects from underlying wait queues. + // NOTE: for programs to resume execution in future snapshot scenarios, + // we will need to re-establish these waiter objects after saving. + k.tasks.unregisterEpollWaiters(ctx) - // Clear the dirent cache before saving because Dirents must be Loaded in a - // particular order (parents before children), and Loading dirents from a cache - // breaks that order. - if err := k.flushMountSourceRefs(); err != nil { - return err - } + // Clear the dirent cache before saving because Dirents must be Loaded in a + // particular order (parents before children), and Loading dirents from a cache + // breaks that order. + if err := k.flushMountSourceRefs(ctx); err != nil { + return err + } - // Ensure that all pending asynchronous work is complete: - // - inode and mount release - // - asynchronuous IO - fs.AsyncBarrier() - - // Once all fs work has completed (flushed references have all been released), - // reset mount mappings. This allows individual mounts to save how inodes map - // to filesystem resources. Without this, fs.Inodes cannot be restored. - fs.SaveInodeMappings() - - // Discard unsavable mappings, such as those for host file descriptors. - // This must be done after waiting for "asynchronous fs work", which - // includes async I/O that may touch application memory. - if err := k.invalidateUnsavableMappings(ctx); err != nil { - return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + // Ensure that all inode and mount release operations have completed. + fs.AsyncBarrier() + + // Once all fs work has completed (flushed references have all been released), + // reset mount mappings. This allows individual mounts to save how inodes map + // to filesystem resources. Without this, fs.Inodes cannot be restored. + fs.SaveInodeMappings() + + // Discard unsavable mappings, such as those for host file descriptors. + // This must be done after waiting for "asynchronous fs work", which + // includes async I/O that may touch application memory. + // + // TODO(gvisor.dev/issue/1624): This rationale is believed to be + // obsolete since AIO callbacks are now waited-for by Kernel.Pause(), + // but this order is conservatively retained for VFS1. + if err := k.invalidateUnsavableMappings(ctx); err != nil { + return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) + } } // Save the CPUID FeatureSet before the rest of the kernel so we can @@ -480,23 +502,23 @@ func (k *Kernel) SaveTo(w io.Writer) error { // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() - if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil { + if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state. kernelStart := time.Now() - var stats state.Stats - if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil { + stats, err := state.Save(ctx, w, k) + if err != nil { return err } - log.Infof("Kernel save stats: %s", &stats) + log.Infof("Kernel save stats: %s", stats.String()) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory file's state. memoryStart := time.Now() - if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil { + if err := k.mf.SaveTo(ctx, w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) @@ -508,7 +530,9 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. -func (k *Kernel) flushMountSourceRefs() error { +// +// Preconditions: !VFS2Enabled. +func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -524,7 +548,7 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) @@ -534,12 +558,7 @@ func (k *Kernel) flushMountSourceRefs() error { // each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if VFS2Enabled { - return nil - } - +func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -547,7 +566,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } @@ -556,9 +575,9 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) return err } -func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { +// Preconditions: !VFS2Enabled. +func (k *Kernel) flushWritesToFiles(ctx context.Context) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -580,6 +599,32 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { }) } +// Preconditions: !VFS2Enabled. +func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { + ts.mu.RLock() + defer ts.mu.RUnlock() + + // Tasks that belong to the same process could potentially point to the + // same FDTable. So we retain a map of processed ones to avoid + // processing the same FDTable multiple times. + processed := make(map[*FDTable]struct{}) + for t := range ts.Root.tids { + // We can skip locking Task.mu here since the kernel is paused. + if t.fdTable == nil { + continue + } + if _, ok := processed[t.fdTable]; ok { + continue + } + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if e, ok := file.FileOperations.(*epoll.EventPoll); ok { + e.UnregisterEpollWaiters() + } + }) + processed[t.fdTable] = struct{}{} + } +} + // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) @@ -605,38 +650,8 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { return nil } -func (ts *TaskSet) unregisterEpollWaiters() { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if VFS2Enabled { - return - } - - ts.mu.RLock() - defer ts.mu.RUnlock() - - // Tasks that belong to the same process could potentially point to the - // same FDTable. So we retain a map of processed ones to avoid - // processing the same FDTable multiple times. - processed := make(map[*FDTable]struct{}) - for t := range ts.Root.tids { - // We can skip locking Task.mu here since the kernel is paused. - if t.fdTable == nil { - continue - } - if _, ok := processed[t.fdTable]; ok { - continue - } - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if e, ok := file.FileOperations.(*epoll.EventPoll); ok { - e.UnregisterEpollWaiters() - } - }) - processed[t.fdTable] = struct{}{} - } -} - // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { +func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() initAppCores := k.applicationCores @@ -647,7 +662,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet - if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil { + if _, err := state.Load(ctx, r, &features); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) @@ -662,11 +677,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the kernel state. kernelStart := time.Now() - var stats state.Stats - if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil { + stats, err := state.Load(ctx, r, k) + if err != nil { return err } - log.Infof("Kernel load stats: %s", &stats) + log.Infof("Kernel load stats: %s", stats.String()) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) // rootNetworkNamespace should be populated after loading the state file. @@ -675,7 +690,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) // Load the memory file's state. memoryStart := time.Now() - if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { + if err := k.mf.LoadFrom(ctx, r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) @@ -687,11 +702,17 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) net.Resume() } - // Ensure that all pending asynchronous work is complete: - // - namedpipe opening - // - inode file opening - if err := fs.AsyncErrorBarrier(); err != nil { - return err + if VFS2Enabled { + if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { + return err + } + } else { + // Ensure that all pending asynchronous work is complete: + // - namedpipe opening + // - inode file opening + if err := fs.AsyncErrorBarrier(); err != nil { + return err + } } tcpip.AsyncLoading.Wait() @@ -820,7 +841,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -833,14 +856,16 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { if ctx.args.MountNamespaceVFS2 == nil { return nil } - // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. - return ctx.args.MountNamespaceVFS2.Root() + root := ctx.args.MountNamespaceVFS2.Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2 takes a reference for us. - return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -890,20 +915,20 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, opener fsbridge.Lookup fsContext *FSContext mntns *fs.MountNamespace + mntnsVFS2 *vfs.MountNamespace ) if VFS2Enabled { - mntnsVFS2 := args.MountNamespaceVFS2 + mntnsVFS2 = args.MountNamespaceVFS2 if mntnsVFS2 == nil { - // MountNamespaceVFS2 adds a reference to the namespace, which is - // transferred to the new process. - mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + // Add a reference to the namespace, which is transferred to the new process. + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() + mntnsVFS2.IncRef() } // Get the root directory from the MountNamespace. - root := args.MountNamespaceVFS2.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. - defer root.DecRef() + root := mntnsVFS2.Root() + root.IncRef() + defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. @@ -921,7 +946,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) @@ -936,7 +961,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals @@ -947,13 +972,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + cu := cleanup.Make(func() { + tg.Release(ctx) + }) + defer cu.Clean() // Check which file to start from. switch { @@ -1010,16 +1039,17 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, - MountNamespaceVFS2: args.MountNamespaceVFS2, + MountNamespaceVFS2: mntnsVFS2, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) + t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(tc) // Simulate exec for tracing. // Success. + cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg @@ -1057,7 +1087,7 @@ func (k *Kernel) Start() error { // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. - k.resumeTimeLocked() + k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() @@ -1069,9 +1099,10 @@ func (k *Kernel) Start() error { // pauseTimeLocked pauses all Timers and Timekeeper updates. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. -func (k *Kernel) pauseTimeLocked() { +// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. +func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { @@ -1093,7 +1124,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() @@ -1113,9 +1144,10 @@ func (k *Kernel) pauseTimeLocked() { // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // -// Preconditions: Any task goroutines running in k must be stopped. k.extMu -// must be locked. -func (k *Kernel) resumeTimeLocked() { +// Preconditions: +// * Any task goroutines running in k must be stopped. +// * k.extMu must be locked. +func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } @@ -1129,7 +1161,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() @@ -1254,13 +1286,22 @@ func (k *Kernel) Kill(es ExitStatus) { } // Pause requests that all tasks in k temporarily stop executing, and blocks -// until all tasks in k have stopped. Multiple calls to Pause nest and require -// an equal number of calls to Unpause to resume execution. +// until all tasks and asynchronous I/O operations in k have stopped. Multiple +// calls to Pause nest and require an equal number of calls to Unpause to +// resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() + k.tasks.aioGoroutines.Wait() +} + +// ReceiveTaskStates receives full states for all tasks. +func (k *Kernel) ReceiveTaskStates() { + k.extMu.Lock() + k.tasks.PullFullState() + k.extMu.Unlock() } // Unpause ends the effect of a previous call to Pause. If Unpause is called @@ -1353,8 +1394,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1470,6 +1512,11 @@ func (k *Kernel) NowMonotonic() int64 { return now } +// AfterFunc implements tcpip.Clock.AfterFunc. +func (k *Kernel) AfterFunc(d time.Duration, f func()) tcpip.Timer { + return ktime.TcpipAfterFunc(k.realtimeClock, d, f) +} + // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { @@ -1494,20 +1541,27 @@ func (k *Kernel) SupervisorContext() context.Context { } } -// SocketEntry represents a socket recorded in Kernel.sockets. It implements +// SocketRecord represents a socket recorded in Kernel.socketsVFS2. +// +// +stateify savable +type SocketRecord struct { + k *Kernel + Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1. + SockVFS2 *vfs.FileDescription // Only used by VFS2. + ID uint64 // Socket table entry number. +} + +// SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable -type SocketEntry struct { +type SocketRecordVFS1 struct { socketEntry - k *Kernel - Sock *refs.WeakRef - SockVFS2 *vfs.FileDescription - ID uint64 // Socket table entry number. + SocketRecord } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *SocketEntry) WeakRefGone() { +func (s *SocketRecordVFS1) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1518,9 +1572,14 @@ func (s *SocketEntry) WeakRefGone() { // Precondition: Caller must hold a reference to sock. func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{k: k, ID: id} + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecordVFS1{ + SocketRecord: SocketRecord{ + k: k, + ID: id, + }, + } s.Sock = refs.NewWeakRef(sock, s) k.sockets.PushBack(s) k.extMu.Unlock() @@ -1532,29 +1591,45 @@ func (k *Kernel) RecordSocket(sock *fs.File) { // Precondition: Caller must hold a reference to sock. // // Note that the socket table will not hold a reference on the -// vfs.FileDescription, because we do not support weak refs on VFS2 files. +// vfs.FileDescription. func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { k.extMu.Lock() - id := k.nextSocketEntry - k.nextSocketEntry++ - s := &SocketEntry{ + if _, ok := k.socketsVFS2[sock]; ok { + panic(fmt.Sprintf("Socket %p added twice", sock)) + } + id := k.nextSocketRecord + k.nextSocketRecord++ + s := &SocketRecord{ k: k, ID: id, SockVFS2: sock, } - k.sockets.PushBack(s) + k.socketsVFS2[sock] = s + k.extMu.Unlock() +} + +// DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table. +func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + delete(k.socketsVFS2, sock) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. // -// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef() // to get a reference on a socket in the table. -func (k *Kernel) ListSockets() []*SocketEntry { +func (k *Kernel) ListSockets() []*SocketRecord { k.extMu.Lock() - var socks []*SocketEntry - for s := k.sockets.Front(); s != nil; s = s.Next() { - socks = append(socks, s) + var socks []*SocketRecord + if VFS2Enabled { + for _, s := range k.socketsVFS2 { + socks = append(socks, s) + } + } else { + for s := k.sockets.Front(); s != nil; s = s.Next() { + socks = append(socks, &s.SocketRecord) + } } k.extMu.Unlock() return socks @@ -1582,7 +1657,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) @@ -1595,16 +1672,16 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { if ctx.k.globalInit == nil { return vfs.VirtualDentry{} } - mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef() - // Root() takes a reference on the root dirent for us. - return mntns.Root() + root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root() + root.IncRef() + return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } - // MountNamespaceVFS2() takes a reference for us. - return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + mntns.IncRef() + return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: @@ -1685,3 +1762,20 @@ func (k *Kernel) ShmMount() *vfs.Mount { func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } + +// Release releases resources owned by k. +// +// Precondition: This should only be called after the kernel is fully +// initialized, e.g. after k.Start() has been called. +func (k *Kernel) Release() { + ctx := k.SupervisorContext() + if VFS2Enabled { + k.hostMount.DecRef(ctx) + k.pipeMount.DecRef(ctx) + k.shmMount.DecRef(ctx) + k.socketMount.DecRef(ctx) + k.vfs.Release(ctx) + } + k.timekeeper.Destroy() + k.vdso.Release(ctx) +} diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 7bfa9075a..99134e634 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -21,11 +21,13 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b688c627..6497dc4ba 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { - r.DecRef() + r.DecRef(ctx) return nil, syserror.ErrInterrupted } } @@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // On a nonblocking, write-only open, the open fails with ENXIO if the // read side isn't open yet. if flags.NonBlocking { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ErrInterrupted } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ab75a87ff..ce0db5583 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) - rFile.DecRef() + rFile.DecRef(ctx) wDone := make(chan struct{}) // This open for write should block because the reader is now gone. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 79645d7d2..67beb0ad6 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -17,6 +17,7 @@ package pipe import ( "fmt" + "io" "sync/atomic" "syscall" @@ -152,7 +153,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) // The p.Open calls below will each take a reference on the Dirent. We // must drop the one we already have. - defer d.DecRef() + defer d.DecRef(ctx) return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } @@ -200,22 +201,22 @@ type readOps struct { // // Precondition: this pipe must have readers. func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { - // Don't block for a zero-length read even if the pipe is empty. - if ops.left() == 0 { - return 0, nil - } - p.mu.Lock() defer p.mu.Unlock() return p.readLocked(ctx, ops) } func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { + // Don't block for a zero-length read even if the pipe is empty. + if ops.left() == 0 { + return 0, nil + } + // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { // There are no writers, return EOF. - return 0, nil + return 0, io.EOF } return 0, syserror.ErrWouldBlock } @@ -388,6 +389,10 @@ func (p *Pipe) rwReadiness() waiter.EventMask { func (p *Pipe) queued() int64 { p.mu.Lock() defer p.mu.Unlock() + return p.queuedLocked() +} + +func (p *Pipe) queuedLocked() int64 { return p.view.Size() } diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index bda739dbe..fe97e9800 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -27,8 +27,8 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") wantN := int64(len(msg)) @@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) if n != 0 || err != syserror.ErrWouldBlock { @@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) @@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index aacf28da2..f665920cb 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -33,7 +34,7 @@ import ( // the old fs architecture. // Release cleans up the pipe's state. -func (p *Pipe) Release() { +func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() @@ -145,9 +146,14 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + iocc := primitive.IOCopyContext{ + IO: io, + Ctx: ctx, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + } + _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v)) return 0, err default: return 0, syscall.ENOTTY diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 7724b4452..ac18785c0 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Reader struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (r *Reader) Release() { +func (r *Reader) Release(context.Context) { r.Pipe.rClose() // Wake up writers. diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 2602bed72..d96bf253b 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -32,6 +33,8 @@ import ( // VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should // not be copied. +// +// +stateify savable type VFSPipe struct { // mu protects the fields below. mu sync.Mutex `state:"nosave"` @@ -61,11 +64,18 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { // // Preconditions: statusFlags should not contain an open access mode. func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) + // Connected pipes share the same locks. + locks := &vfs.FileLocks{} + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) +} + +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error { + return syserror.ESPIPE } // Open opens the pipe represented by vp. -func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -75,7 +85,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s return nil, syserror.EINVAL } - fd := vp.newFD(mnt, vfsd, statusFlags) + fd := vp.newFD(mnt, vfsd, statusFlags, locks) // Named pipes have special blocking semantics during open: // @@ -98,7 +108,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } @@ -109,12 +119,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // Non-blocking, write-only opens fail with ENXIO when the read // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } } @@ -127,10 +137,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } + fd.LockFD.Init(locks) fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, @@ -155,16 +166,19 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) * // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to // other FileDescriptions for splice(2) and tee(2). +// +// +stateify savable type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD pipe *Pipe } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *VFSPipeFD) Release() { +func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() @@ -195,6 +209,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { } } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.pipe.EventRegister(e, mask) @@ -222,8 +241,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PipeSize implements fcntl(F_GETPIPE_SZ). func (fd *VFSPipeFD) PipeSize() int64 { - // Inline Pipe.FifoSize() rather than calling it with nil Context and - // fs.File and ignoring the returned error (which is always nil). + // Inline Pipe.FifoSize() since we don't have a fs.File. fd.pipe.mu.Lock() defer fd.pipe.mu.Unlock() return fd.pipe.max @@ -234,19 +252,57 @@ func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } -// IOSequence returns a useremm.IOSequence that reads up to count bytes from, -// or writes up to count bytes to, fd. -func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { - return usermem.IOSequence{ +// SpliceToNonPipe performs a splice operation from fd to a non-pipe file. +func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) { + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + + // Cap the sequence at number of bytes actually available. + v := fd.pipe.queuedLocked() + if v < count { + count = v + } + src := usermem.IOSequence{ IO: fd, Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), } + + var ( + n int64 + err error + ) + if off == -1 { + n, err = out.Write(ctx, src, vfs.WriteOptions{}) + } else { + n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{}) + } + if n > 0 { + fd.pipe.view.TrimFront(n) + } + return n, err } -// CopyIn implements usermem.IO.CopyIn. +// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd. +func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) { + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + + dst := usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } + + if off == -1 { + return in.Read(ctx, dst, vfs.ReadOptions{}) + } + return in.PRead(ctx, dst, off, vfs.ReadOptions{}) +} + +// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's +// responsibility to trim fd.pipe.view after the read is completed. func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { origCount := int64(len(dst)) - n, err := fd.pipe.read(ctx, readOps{ + n, err := fd.pipe.readLocked(ctx, readOps{ left: func() int64 { return int64(len(dst)) }, @@ -255,7 +311,6 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, }, read: func(view *buffer.View) (int64, error) { n, err := view.ReadAt(dst, 0) - view.TrimFront(int64(n)) return int64(n), err }, }) @@ -271,7 +326,7 @@ func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, // CopyOut implements usermem.IO.CopyOut. func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { origCount := int64(len(src)) - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return int64(len(src)) }, @@ -295,7 +350,7 @@ func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, // ZeroOut implements usermem.IO.ZeroOut. func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { origCount := toZero - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return toZero }, @@ -316,14 +371,15 @@ func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int6 return n, err } -// CopyInTo implements usermem.IO.CopyInTo. +// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's +// responsibility to trim fd.pipe.view after the read is completed. func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { count := ars.NumBytes() if count == 0 { return 0, nil } origCount := count - n, err := fd.pipe.read(ctx, readOps{ + n, err := fd.pipe.readLocked(ctx, readOps{ left: func() int64 { return count }, @@ -332,7 +388,6 @@ func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst }, read: func(view *buffer.View) (int64, error) { n, err := view.ReadToSafememWriter(dst, uint64(count)) - view.TrimFront(int64(n)) return int64(n), err }, }) @@ -352,7 +407,7 @@ func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, return 0, nil } origCount := count - n, err := fd.pipe.write(ctx, writeOps{ + n, err := fd.pipe.writeLocked(ctx, writeOps{ left: func() int64 { return count }, @@ -446,3 +501,13 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 5bc6aa931..ef4b70ca3 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Writer struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (w *Writer) Release() { +func (w *Writer) Release(context.Context) { w.Pipe.wClose() // Wake up readers. diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index e23e796ef..1145faf13 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" @@ -224,8 +225,9 @@ func (s *ptraceStop) Killable() bool { // beginPtraceStopLocked does not signal t's tracer or wake it if it is // waiting. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) beginPtraceStopLocked() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -270,8 +272,9 @@ func (t *Task) ptraceTrapLocked(code int32) { // ptraceStop, temporarily preventing it from being removed by a concurrent // Task.Kill, and returns true. Otherwise it returns false. // -// Preconditions: The TaskSet mutex must be locked. The caller must be running -// on the task goroutine of t's tracer. +// Preconditions: +// * The TaskSet mutex must be locked. +// * The caller must be running on the task goroutine of t's tracer. func (t *Task) ptraceFreeze() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() @@ -301,8 +304,9 @@ func (t *Task) ptraceUnfreeze() { t.ptraceUnfreezeLocked() } -// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be -// locked. +// Preconditions: +// * t must be in a frozen ptraceStop. +// * t's signal mutex must be locked. func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. @@ -497,8 +501,9 @@ func (t *Task) forgetTracerLocked() { // ptraceSignalLocked is called after signal dequeueing to check if t should // enter ptrace signal-delivery-stop. // -// Preconditions: The signal mutex must be locked. The caller must be running -// on the task goroutine. +// Preconditions: +// * The signal mutex must be locked. +// * The caller must be running on the task goroutine. func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false @@ -828,8 +833,9 @@ func (t *Task) ptraceInterrupt(target *Task) error { return nil } -// Preconditions: The TaskSet mutex must be locked for writing. t must have a -// tracer. +// Preconditions: +// * The TaskSet mutex must be locked for writing. +// * t must have a tracer. func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { const valid = uintptr(linux.PTRACE_O_EXITKILL | linux.PTRACE_O_TRACESYSGOOD | @@ -994,18 +1000,15 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) - if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ - IgnorePermissions: true, - }); err != nil { + if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } - _, err := t.CopyOut(data, word) + _, err := word.CopyOut(t, data) return err case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: - _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ - IgnorePermissions: true, - }) + word := t.Arch().Native(uintptr(data)) + _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: @@ -1018,6 +1021,9 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, @@ -1044,10 +1050,14 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + + mm := t.MemoryManager() + t.p.PullFullState(mm.AddressSpace(), t.Arch()) + ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, - IO: t.MemoryManager(), + IO: mm, Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, @@ -1056,6 +1066,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if err != nil { return err } + t.p.FullStateChanged() ar.End -= usermem.Addr(n) return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) @@ -1065,12 +1076,12 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if target.ptraceSiginfo == nil { return syserror.EINVAL } - _, err := t.CopyOut(data, target.ptraceSiginfo) + _, err := target.ptraceSiginfo.CopyOut(t, data) return err case linux.PTRACE_SETSIGINFO: var info arch.SignalInfo - if _, err := t.CopyIn(data, &info); err != nil { + if _, err := info.CopyIn(t, data); err != nil { return err } t.tg.pidns.owner.mu.RLock() @@ -1085,7 +1096,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { if addr != linux.SignalSetSize { return syserror.EINVAL } - _, err := t.CopyOut(data, target.SignalMask()) + mask := target.SignalMask() + _, err := mask.CopyOut(t, data) return err case linux.PTRACE_SETSIGMASK: @@ -1093,7 +1105,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return syserror.EINVAL } var mask linux.SignalSet - if _, err := t.CopyIn(data, &mask); err != nil { + if _, err := mask.CopyIn(t, data); err != nil { return err } // The target's task goroutine is stopped, so this is safe: @@ -1108,7 +1120,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() - _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + _, err := primitive.CopyUint64Out(t, usermem.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index cef1276ec..609ad3941 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -30,7 +30,7 @@ func (t *Task) ptraceArch(target *Task, req int64, addr, data usermem.Addr) erro if err != nil { return err } - _, err = t.CopyOut(data, n) + _, err = n.CopyOut(t, data) return err case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index 18416643b..2a9023fdf 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -173,8 +173,10 @@ func (t *Task) OldRSeqCPUAddr() usermem.Addr { // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with // t's CPU number. // -// Preconditions: t.RSeqAvailable() == true. The caller must be running on the -// task goroutine. t's AddressSpace must be active. +// Preconditions: +// * t.RSeqAvailable() == true. +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { t.oldRSeqCPUAddr = addr @@ -189,8 +191,9 @@ func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error { return nil } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqUpdateCPU() error { if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { t.rseqCPU = -1 @@ -209,8 +212,9 @@ func (t *Task) rseqUpdateCPU() error { return oerr } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) oldRSeqCopyOutCPU() error { if t.oldRSeqCPUAddr == 0 { return nil @@ -222,8 +226,9 @@ func (t *Task) oldRSeqCopyOutCPU() error { return err } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqCopyOutCPU() error { if t.rseqAddr == 0 { return nil @@ -240,8 +245,9 @@ func (t *Task) rseqCopyOutCPU() error { return err } -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. @@ -269,8 +275,9 @@ func (t *Task) rseqClearCPU() error { // // See kernel/rseq.c:rseq_ip_fixup for reference. // -// Preconditions: The caller must be running on the task goroutine. t's -// AddressSpace must be active. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) rseqAddrInterrupt() { if t.rseqAddr == 0 { return diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index c38c5a40c..387edfa91 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -18,7 +18,6 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" @@ -27,25 +26,18 @@ import ( const maxSyscallFilterInstructions = 1 << 15 -// seccompData is equivalent to struct seccomp_data, which contains the data -// passed to seccomp-bpf filters. -type seccompData struct { - // nr is the system call number. - nr int32 - - // arch is an AUDIT_ARCH_* value indicating the system call convention. - arch uint32 - - // instructionPointer is the value of the instruction pointer at the time - // of the system call. - instructionPointer uint64 - - // args contains the first 6 system call arguments. - args [6]uint64 -} - -func (d *seccompData) asBPFInput() bpf.Input { - return bpf.InputBytes{binary.Marshal(nil, usermem.ByteOrder, d), usermem.ByteOrder} +// dataAsBPFInput returns a serialized BPF program, only valid on the current task +// goroutine. +// +// Note: this is called for every syscall, which is a very hot path. +func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { + buf := t.CopyScratchBuffer(d.SizeBytes()) + d.MarshalUnsafe(buf) + return bpf.InputBytes{ + Data: buf, + // Go-marshal always uses the native byte order. + Order: usermem.ByteOrder, + } } func seccompSiginfo(t *Task, errno, sysno int32, ip usermem.Addr) *arch.SignalInfo { @@ -112,20 +104,20 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 { - data := seccompData{ - nr: sysno, - arch: t.tc.st.AuditNumber, - instructionPointer: uint64(ip), + data := linux.SeccompData{ + Nr: sysno, + Arch: t.tc.st.AuditNumber, + InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so // we can't do any slicing tricks or even use copy/append here. for i, arg := range args { - if i >= len(data.args) { + if i >= len(data.Args) { break } - data.args[i] = arg.Uint64() + data.Args[i] = arg.Uint64() } - input := data.asBPFInput() + input := dataAsBPFInput(t, &data) ret := uint32(linux.SECCOMP_RET_ALLOW) f := t.syscallFilters.Load() diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index c00fa1138..c39ecfb8f 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -283,6 +283,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File return nil } +// GetStat extracts semid_ds information from the set. +func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) { + s.mu.Lock() + defer s.mu.Unlock() + + // "The calling process must have read permission on the semaphore set." + if !s.checkPerms(creds, fs.PermMask{Read: true}) { + return nil, syserror.EACCES + } + + ds := &linux.SemidDS{ + SemPerm: linux.IPCPerm{ + Key: uint32(s.key), + UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), + GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), + CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), + CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), + Mode: uint16(s.perms.LinuxMode()), + Seq: 0, // IPC sequence not supported. + }, + SemOTime: s.opTime.TimeT(), + SemCTime: s.changeTime.TimeT(), + SemNSems: uint64(s.Size()), + } + return ds, nil +} + // SetVal overrides a semaphore value, waking up waiters as needed. func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error { if val < 0 || val > valueMax { @@ -320,7 +347,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti } for _, val := range vals { - if val < 0 || val > valueMax { + if val > valueMax { return syserror.ERANGE } } diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0e19286de..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,7 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -61,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(func() { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -87,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -162,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(func() { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -174,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -301,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -315,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -395,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. - tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index bfd779837..80a592c8f 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,12 +1,25 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "shm_refs", + out = "shm_refs.go", + package = "shm", + prefix = "Shm", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "Shm", + }, +) + go_library( name = "shm", srcs = [ "device.go", "shm.go", + "shm_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -14,13 +27,13 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f66cfcc7f..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,13 +39,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -253,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -323,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a @@ -338,14 +359,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. + // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -370,7 +391,7 @@ type Shm struct { // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. - fr platform.FileRange + fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` @@ -429,11 +450,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. -func (s *Shm) DecRef() { - s.DecRefWithDestructor(s.destroy) +func (s *Shm) DecRef(ctx context.Context) { + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -643,30 +667,28 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy() { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). -func (s *Shm) MarkDestroyed() { +func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef() + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 3eb78e91b..76d472292 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 8243bb93e..78f718cfe 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -17,7 +17,6 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" @@ -76,7 +75,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { } // Release implements fs.FileOperations.Release. -func (s *SignalOperations) Release() {} +func (s *SignalOperations) Release(context.Context) {} // Mask returns the signal mask. func (s *SignalOperations) Mask() linux.SignalSet { @@ -103,8 +102,7 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } // Copy out the signal info using the specified format. - var buf [128]byte - binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + infoNative := linux.SignalfdSiginfo{ Signo: uint32(info.Signo), Errno: info.Errno, Code: info.Code, @@ -113,9 +111,13 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS Status: info.Status(), Overrun: uint32(info.Overrun()), Addr: info.Addr(), - }) - n, err := dst.CopyOut(ctx, buf[:]) - return int64(n), err + } + n, err := infoNative.WriteTo(dst.Writer(ctx)) + if err == usermem.ErrEndOfIOSequence { + // Partial copy-out ok. + err = nil + } + return n, err } // Readiness implements waiter.Waitable.Readiness. diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 413111faf..332bdb8e8 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -348,6 +348,16 @@ func (s *SyscallTable) LookupName(sysno uintptr) string { return fmt.Sprintf("sys_%d", sysno) // Unlikely. } +// LookupNo looks up a syscall number by name. +func (s *SyscallTable) LookupNo(name string) (uintptr, error) { + for i, syscall := range s.Table { + if syscall.Name == name { + return uintptr(i), nil + } + } + return 0, fmt.Errorf("syscall %q not found", name) +} + // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 4607cde2f..a83ce219c 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -98,6 +98,15 @@ func (s *syslog) Log() []byte { s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } + if VFS2Enabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...) + if FUSEEnabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...) + } + } + time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index f48247c94..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -68,6 +68,21 @@ type Task struct { // runState is exclusive to the task goroutine. runState taskRunState + // taskWorkCount represents the current size of the task work queue. It is + // used to avoid acquiring taskWorkMu when the queue is empty. + // + // Must accessed with atomic memory operations. + taskWorkCount int32 + + // taskWorkMu protects taskWork. + taskWorkMu sync.Mutex `state:"nosave"` + + // taskWork is a queue of work to be executed before resuming user execution. + // It is similar to the task_work mechanism in Linux. + // + // taskWork is exclusive to the task goroutine. + taskWork []TaskWorker + // haveSyscallReturn is true if tc.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // @@ -550,11 +565,20 @@ type Task struct { // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` + // robustList is a pointer to the head of the tasks's robust futex + // list. + robustList usermem.Addr + // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // // startTime is protected by mu. startTime ktime.Time + + // kcov is the kcov instance providing code coverage owned by this task. + // + // kcov is exclusive to the task goroutine. + kcov *Kcov } func (t *Task) savePtraceTracer() *Task { @@ -632,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: @@ -711,17 +737,16 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef() root := t.fsContext.RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } return root != realRoot } @@ -844,7 +869,6 @@ func (t *Task) MountNamespace() *fs.MountNamespace { func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() - t.mountNamespaceVFS2.IncRef() return t.mountNamespaceVFS2 } @@ -884,3 +908,16 @@ func (t *Task) UID() uint32 { func (t *Task) GID() uint32 { return uint32(t.Credentials().EffectiveKGID) } + +// SetKcov sets the kcov instance associated with t. +func (t *Task) SetKcov(k *Kcov) { + t.kcov = k +} + +// ResetKcov clears the kcov instance associated with t. +func (t *Task) ResetKcov() { + if t.kcov != nil { + t.kcov.OnTaskExit() + t.kcov = nil + } +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index e1ecca99e..682080c14 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -161,6 +162,10 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { return 0, nil, syserror.EINVAL } + // Pull task registers and FPU state, a cloned task will inherit the + // state of the current task. + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) @@ -199,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } + cu := cleanup.Make(func() { + ipcns.DecRef(t) + }) + defer cu.Clean() netns := t.NetworkNamespace() if opts.NewNetworkNamespace { @@ -210,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { mntnsVFS2.IncRef() + cu.Add(func() { + mntnsVFS2.DecRef(t) + }) } tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } + cu.Add(func() { + tc.release() + }) // clone() returns 0 in the child. tc.Arch.SetReturn(0) if opts.Stack != 0 { @@ -237,7 +254,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { var fdTable *FDTable if opts.NewFiles { - fdTable = t.fdTable.Fork() + fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() @@ -291,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - nt, err := t.tg.pidns.owner.NewTask(cfg) + nt, err := t.tg.pidns.owner.NewTask(t, cfg) + // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does + // the cleanup for us. + cu.Release() if err != nil { - if opts.NewThreadGroup { - tg.release() - } return 0, nil, err } @@ -337,12 +354,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt.SetClearTID(opts.ChildTID) } if opts.ChildSetTID { - // Can't use Task.CopyOut, which assumes AddressSpaceActive. - usermem.CopyObjectOut(t, nt.MemoryManager(), opts.ChildTID, nt.ThreadID(), usermem.IOOpts{}) + ctid := nt.ThreadID() + ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) } ntid := t.tg.pidns.IDOfTask(nt) if opts.ParentSetTID { - t.CopyOut(opts.ParentTID, ntid) + ntid.CopyOut(t, opts.ParentTID) } kind := ptraceCloneKindClone @@ -505,12 +522,13 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable if opts.NewFiles { oldFDTable = t.fdTable - t.fdTable = oldFDTable.Fork() + t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext if opts.NewFSContext { @@ -519,10 +537,10 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Unlock() if oldFDTable != nil { - oldFDTable.DecRef() + oldFDTable.DecRef(t) } if oldFSContext != nil { - oldFSContext.DecRef() + oldFSContext.DecRef(t) } return nil } diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 9fa528384..d1136461a 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -126,7 +126,11 @@ func (t *Task) SyscallTable() *SyscallTable { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Stack() *arch.Stack { - return &arch.Stack{t.Arch(), t.MemoryManager(), usermem.Addr(t.Arch().Stack())} + return &arch.Stack{ + Arch: t.Arch(), + IO: t.MemoryManager(), + Bottom: usermem.Addr(t.Arch().Stack()), + } } // LoadTaskImage loads a specified file into a new TaskContext. diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 00c425cca..412d471d3 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -198,11 +198,18 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) t.tg.pidns.owner.mu.Unlock() + oldFDTable := t.fdTable + t.fdTable = t.fdTable.Fork(t) + oldFDTable.DecRef(t) + // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) + // Handle the robust futex list. + t.exitRobustList() + // NOTE(b/30815691): We currently do not implement privileged // executables (set-user/group-ID bits and file capabilities). This // allows us to unconditionally enable user dumpability on the new mm. @@ -219,6 +226,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tc = *r.tc t.mu.Unlock() t.unstopVforkParent() + t.p.FullStateChanged() // NOTE(b/30316266): All locks must be dropped prior to calling Activate. t.MemoryManager().Activate(t) @@ -229,9 +237,10 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { // promoteLocked makes t the leader of its thread group. If t is already the // thread group leader, promoteLocked is a no-op. // -// Preconditions: All other tasks in t's thread group, including the existing -// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must -// be locked for writing. +// Preconditions: +// * All other tasks in t's thread group, including the existing leader (if it +// is not t), have reached TaskExitZombie. +// * The TaskSet mutex must be locked for writing. func (t *Task) promoteLocked() { oldLeader := t.tg.leader if t == oldLeader { diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index c4ade6e8e..ce7b9641d 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -239,6 +239,8 @@ func (*runExitMain) execute(t *Task) taskRunState { t.traceExitEvent() lastExiter := t.exitThreadGroup() + t.ResetKcov() + // If the task has a cleartid, and the thread group wasn't killed by a // signal, handle that before releasing the MM. if t.cleartid != 0 { @@ -246,13 +248,17 @@ func (*runExitMain) execute(t *Task) taskRunState { signaled := t.tg.exiting && t.tg.exitStatus.Signaled() t.tg.signalHandlers.mu.Unlock() if !signaled { - if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil { + zero := ThreadID(0) + if _, err := zero.CopyOut(t, t.cleartid); err == nil { t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1) } // If the CopyOut fails, there's nothing we can do. } } + // Handle the robust futex list. + t.exitRobustList() + // Deactivate the address space and update max RSS before releasing the // task's MM. t.Deactivate() @@ -266,20 +272,21 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsContext.DecRef() - t.fdTable.DecRef() + t.fsContext.DecRef(t) + t.fdTable.DecRef(t) t.mu.Lock() if t.mountNamespaceVFS2 != nil { - t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release() + t.tg.Release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index a53e77c9f..c80391475 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -15,6 +15,8 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) @@ -52,3 +54,127 @@ func (t *Task) LoadUint32(addr usermem.Addr) (uint32, error) { func (t *Task) GetSharedKey(addr usermem.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } + +// GetRobustList sets the robust futex list for the task. +func (t *Task) GetRobustList() usermem.Addr { + t.mu.Lock() + addr := t.robustList + t.mu.Unlock() + return addr +} + +// SetRobustList sets the robust futex list for the task. +func (t *Task) SetRobustList(addr usermem.Addr) { + t.mu.Lock() + t.robustList = addr + t.mu.Unlock() +} + +// exitRobustList walks the robust futex list, marking locks dead and notifying +// wakers. It corresponds to Linux's exit_robust_list(). Following Linux, +// errors are silently ignored. +func (t *Task) exitRobustList() { + t.mu.Lock() + addr := t.robustList + t.robustList = 0 + t.mu.Unlock() + + if addr == 0 { + return + } + + var rl linux.RobustListHead + if _, err := rl.CopyIn(t, usermem.Addr(addr)); err != nil { + return + } + + next := primitive.Uint64(rl.List) + done := 0 + var pendingLockAddr usermem.Addr + if rl.ListOpPending != 0 { + pendingLockAddr = usermem.Addr(rl.ListOpPending + rl.FutexOffset) + } + + // Wake up normal elements. + for usermem.Addr(next) != addr { + // We traverse to the next element of the list before we + // actually wake anything. This prevents the race where waking + // this futex causes a modification of the list. + thisLockAddr := usermem.Addr(uint64(next) + rl.FutexOffset) + + // Try to decode the next element in the list before waking the + // current futex. But don't check the error until after we've + // woken the current futex. Linux does it in this order too + _, nextErr := next.CopyIn(t, usermem.Addr(next)) + + // Wakeup the current futex if it's not pending. + if thisLockAddr != pendingLockAddr { + t.wakeRobustListOne(thisLockAddr) + } + + // If there was an error copying the next futex, we must bail. + if nextErr != nil { + break + } + + // This is a user structure, so it could be a massive list, or + // even contain a loop if they are trying to mess with us. We + // cap traversal to prevent that. + done++ + if done >= linux.ROBUST_LIST_LIMIT { + break + } + } + + // Is there a pending entry to wake? + if pendingLockAddr != 0 { + t.wakeRobustListOne(pendingLockAddr) + } +} + +// wakeRobustListOne wakes a single futex from the robust list. +func (t *Task) wakeRobustListOne(addr usermem.Addr) { + // Bit 0 in address signals PI futex. + pi := addr&1 == 1 + addr = addr &^ 1 + + // Load the futex. + f, err := t.LoadUint32(addr) + if err != nil { + // Can't read this single value? Ignore the problem. + // We can wake the other futexes in the list. + return + } + + tid := uint32(t.ThreadID()) + for { + // Is this held by someone else? + if f&linux.FUTEX_TID_MASK != tid { + return + } + + // This thread is dying and it's holding this futex. We need to + // set the owner died bit and wake up any waiters. + newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED + if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { + return + } else if curF != f { + // Futex changed out from under us. Try again... + f = curF + continue + } + + // Wake waiters if there are any. + if f&linux.FUTEX_WAITERS != 0 { + private := f&linux.FUTEX_PRIVATE_FLAG != 0 + if pi { + t.Futex().UnlockPI(t, addr, tid, private) + return + } + t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) + } + + // Done. + return + } +} diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index eeccaa197..d23cea802 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -27,6 +27,9 @@ const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 + // maxCodeDebugBytes is the maximum number of user code bytes that may be + // printed by debugDumpCode. + maxCodeDebugBytes = 128 ) // Infof logs an formatted info message by calling log.Infof. @@ -61,6 +64,7 @@ func (t *Task) IsLogging(level log.Level) bool { func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() + t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } @@ -128,6 +132,45 @@ func (t *Task) debugDumpStack() { } } +// debugDumpCode logs user code contents at log level debug. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) debugDumpCode() { + if !t.IsLogging(log.Debug) { + return + } + m := t.MemoryManager() + if m == nil { + t.Debugf("Memory manager for task is gone, skipping application code dump.") + return + } + t.Debugf("Code:") + // Print code on both sides of the instruction register. + start := usermem.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 + // Round addr down to a 16-byte boundary. + start &= ^usermem.Addr(15) + // Print 16 bytes per line, one byte at a time. + for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { + addr, ok := start.AddLength(offset) + if !ok { + break + } + var data [16]byte + n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ + IgnorePermissions: true, + }) + // Print as much of the line as we can, even if an error was + // encountered. + if n > 0 { + t.Debugf("%x: % x", addr, data[:n]) + } + if err != nil { + t.Debugf("Error reading stack at address %x: %v", addr+usermem.Addr(n), err) + break + } + } +} + // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they @@ -203,6 +246,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer file.DecRef() + defer file.DecRef(t) trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index d654dd997..8dc3fec90 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -26,6 +26,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -140,7 +141,7 @@ func (*runApp) handleCPUIDInstruction(t *Task) error { region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) - _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found) + _, err := t.CopyInBytes(usermem.Addr(t.Arch().IP()), found) if err == nil && bytes.Equal(expected, found) { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) @@ -167,15 +168,30 @@ func (app *runApp) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // We're about to switch to the application again. If there's still a + // Execute any task work callbacks before returning to user space. + if atomic.LoadInt32(&t.taskWorkCount) > 0 { + t.taskWorkMu.Lock() + queue := t.taskWork + t.taskWork = nil + atomic.StoreInt32(&t.taskWorkCount, 0) + t.taskWorkMu.Unlock() + + // Do not hold taskWorkMu while executing task work, which may register + // more work. + for _, work := range queue { + work.TaskWork(t) + } + } + + // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. (Note that restoring the saved signal mask may unblock // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { - if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { - if sre == ERESTART_RESTARTBLOCK { + if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre == syserror.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscallWithRestartBlock() } else { @@ -245,7 +261,7 @@ func (app *runApp) execute(t *Task) taskRunState { region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) - info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU) + info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 09366b60c..52c55d13d 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -133,9 +133,10 @@ func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { } } -// Preconditions: The caller must be running on the task goroutine, and leaving -// a state indicated by a previous call to -// t.accountTaskGoroutineEnter(state). +// Preconditions: +// * The caller must be running on the task goroutine +// * The caller must be leaving a state indicated by a previous call to +// t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { if state != TaskGoroutineRunningApp { // Task is unblocking/continuing. @@ -191,8 +192,8 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats { return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) } -// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex -// must be locked. +// Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus: +// * The TaskSet mutex must be locked. func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats // Account for live tasks. diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 79766cafe..ebdb83061 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -159,7 +159,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS sigact := computeAction(linux.Signal(info.Signo), act) if t.haveSyscallReturn { - if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { + if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { // Signals that are ignored, cause a thread group stop, or // terminate the thread group do not interact with interrupted // syscalls; in Linux terms, they are never returned to the signal @@ -168,11 +168,11 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS // signal that is actually handled (by userspace). if sigact == SignalActionHandler { switch { - case sre == ERESTARTNOHAND: + case sre == syserror.ERESTARTNOHAND: fallthrough - case sre == ERESTART_RESTARTBLOCK: + case sre == syserror.ERESTART_RESTARTBLOCK: fallthrough - case (sre == ERESTARTSYS && !act.IsRestart()): + case (sre == syserror.ERESTARTSYS && !act.IsRestart()): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) default: @@ -255,10 +255,15 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) } } + mm := t.MemoryManager() // Set up the signal handler. If we have a saved signal mask, the signal // handler should run with the current mask, but sigreturn should restore // the saved one. - st := &arch.Stack{t.Arch(), t.MemoryManager(), sp} + st := &arch.Stack{ + Arch: t.Arch(), + IO: mm, + Bottom: sp, + } mask := t.signalMask if t.haveSavedSignalMask { mask = t.savedSignalMask @@ -273,12 +278,13 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Please see the linux code as reference: // linux/arch/arm64/kernel/signal.c:setup_return() if act.Flags&linux.SA_RESTORER == 0 { - act.Restorer = t.MemoryManager().VDSOSigReturn() + act.Restorer = mm.VDSOSigReturn() } if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil { return err } + t.p.FullStateChanged() t.haveSavedSignalMask = false // Add our signal mask. @@ -310,14 +316,16 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Restore our signal mask. SIGKILL and SIGSTOP should not be blocked. t.SetSignalMask(sigset &^ UnblockableSignals) + t.p.FullStateChanged() return ctrlResume, nil } // Sigtimedwait implements the semantics of sigtimedwait(2). // -// Preconditions: The caller must be running on the task goroutine. t.exitState -// < TaskExitZombie. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t.exitState < TaskExitZombie. func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { // set is the set of signals we're interested in; invert it to get the set // of signals to block. @@ -581,8 +589,9 @@ func (t *Task) SignalMask() linux.SignalSet { // SetSignalMask sets t's signal mask. // -// Preconditions: SetSignalMask can only be called by the task goroutine. -// t.exitState < TaskExitZombie. +// Preconditions: +// * The caller must be running on the task goroutine. +// * t.exitState < TaskExitZombie. func (t *Task) SetSignalMask(mask linux.SignalSet) { // By precondition, t prevents t.tg from completing an execve and mutating // t.tg.signalHandlers, so we can skip the TaskSet mutex. @@ -628,7 +637,7 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's // comment). // -// Preconditions: SetSavedSignalMask can only be called by the task goroutine. +// Preconditions: The caller must be running on the task goroutine. func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { t.savedSignalMask = mask t.haveSavedSignalMask = true @@ -636,6 +645,7 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { // SignalStack returns the task-private signal stack. func (t *Task) SignalStack() arch.SignalStack { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) alt := t.signalStack if t.onSignalStack(alt) { alt.Flags |= arch.SignalStackFlagOnStack @@ -1050,6 +1060,8 @@ func (*runInterrupt) execute(t *Task) taskRunState { // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { // Indicate that we've dequeued a stop signal before unlocking the // signal mutex; initiateGroupStop will check for races with diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8485fb4b6..8e28230cc 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -98,14 +99,18 @@ type TaskConfig struct { // NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. -func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { +// +// If successful, NewTask transfers references held by cfg to the new task. +// Otherwise, NewTask releases them. +func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef() - cfg.FDTable.DecRef() + cfg.FSContext.DecRef(ctx) + cfg.FDTable.DecRef(ctx) + cfg.IPCNamespace.DecRef(ctx) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef() + cfg.MountNamespaceVFS2.DecRef(ctx) } return nil, err } diff --git a/pkg/sentry/kernel/task_stop.go b/pkg/sentry/kernel/task_stop.go index 10c6e455c..a35948a5f 100644 --- a/pkg/sentry/kernel/task_stop.go +++ b/pkg/sentry/kernel/task_stop.go @@ -99,8 +99,9 @@ type TaskStop interface { // beginInternalStop indicates the start of an internal stop that applies to t. // -// Preconditions: The task must not already be in an internal stop (i.e. t.stop -// == nil). The caller must be running on the task goroutine. +// Preconditions: +// * The caller must be running on the task goroutine. +// * The task must not already be in an internal stop (i.e. t.stop == nil). func (t *Task) beginInternalStop(s TaskStop) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() @@ -109,8 +110,8 @@ func (t *Task) beginInternalStop(s TaskStop) { t.beginInternalStopLocked(s) } -// Preconditions: The signal mutex must be locked. All preconditions for -// Task.beginInternalStop also apply. +// Preconditions: Same as beginInternalStop, plus: +// * The signal mutex must be locked. func (t *Task) beginInternalStopLocked(s TaskStop) { if t.stop != nil { panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) @@ -128,8 +129,9 @@ func (t *Task) beginInternalStopLocked(s TaskStop) { // t.stop, which is why there is no endInternalStop that locks the signal mutex // for you. // -// Preconditions: The signal mutex must be locked. The task must be in an -// internal stop (i.e. t.stop != nil). +// Preconditions: +// * The signal mutex must be locked. +// * The task must be in an internal stop (i.e. t.stop != nil). func (t *Task) endInternalStopLocked() { if t.stop == nil { panic("Attempting to leave non-existent internal stop") @@ -205,6 +207,22 @@ func (ts *TaskSet) BeginExternalStop() { } } +// PullFullState receives full states for all tasks. +func (ts *TaskSet) PullFullState() { + ts.mu.Lock() + defer ts.mu.Unlock() + if ts.Root == nil { + return + } + for t := range ts.Root.tids { + t.Activate() + if mm := t.MemoryManager(); mm != nil { + t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) + } + t.Deactivate() + } +} + // EndExternalStop indicates the end of an external stop started by a previous // call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task // goroutines to resume. diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index a5903b0b5..0141459e7 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -29,75 +30,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel -// include/linux/errno.h. These errnos are never returned to userspace -// directly, but are used to communicate the expected behavior of an -// interrupted syscall from the syscall to signal handling. -type SyscallRestartErrno int - -// These numeric values are significant because ptrace syscall exit tracing can -// observe them. -// -// For all of the following errnos, if the syscall is not interrupted by a -// signal delivered to a user handler, the syscall is restarted. -const ( - // ERESTARTSYS is returned by an interrupted syscall to indicate that it - // should be converted to EINTR if interrupted by a signal delivered to a - // user handler without SA_RESTART set, and restarted otherwise. - ERESTARTSYS = SyscallRestartErrno(512) - - // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it - // should always be restarted. - ERESTARTNOINTR = SyscallRestartErrno(513) - - // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it - // should be converted to EINTR if interrupted by a signal delivered to a - // user handler, and restarted otherwise. - ERESTARTNOHAND = SyscallRestartErrno(514) - - // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate - // that it should be restarted using a custom function. The interrupted - // syscall must register a custom restart function by calling - // Task.SetRestartSyscallFn. - ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) -) - var vsyscallCount = metric.MustCreateNewUint64Metric("/kernel/vsyscall_count", false /* sync */, "Number of times vsyscalls were invoked by the application") -// Error implements error.Error. -func (e SyscallRestartErrno) Error() string { - // Descriptions are borrowed from strace. - switch e { - case ERESTARTSYS: - return "to be restarted if SA_RESTART is set" - case ERESTARTNOINTR: - return "to be restarted" - case ERESTARTNOHAND: - return "to be restarted if no handler" - case ERESTART_RESTARTBLOCK: - return "interrupted by signal" - default: - return "(unknown interrupt error)" - } -} - -// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by -// rv, the value in a syscall return register. -func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { - switch int(rv) { - case -int(ERESTARTSYS): - return ERESTARTSYS, true - case -int(ERESTARTNOINTR): - return ERESTARTNOINTR, true - case -int(ERESTARTNOHAND): - return ERESTARTNOHAND, true - case -int(ERESTART_RESTARTBLOCK): - return ERESTART_RESTARTBLOCK, true - default: - return 0, false - } -} - // SyscallRestartBlock represents the restart block for a syscall restartable // with a custom function. It encapsulates the state required to restart a // syscall across a S/R. @@ -354,7 +288,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { // Grab the caller up front, to make sure there's a sensible stack. caller := t.Arch().Native(uintptr(0)) - if _, err := t.CopyIn(usermem.Addr(t.Arch().Stack()), caller); err != nil { + if _, err := caller.CopyIn(t, usermem.Addr(t.Arch().Stack())); err != nil { t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) @@ -390,7 +324,7 @@ func (t *Task) doVsyscall(addr usermem.Addr, sysno uintptr) taskRunState { type runVsyscallAfterPtraceEventSeccomp struct { addr usermem.Addr sysno uintptr - caller interface{} + caller marshal.Marshallable } func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { @@ -413,7 +347,7 @@ func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState { return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller) } -func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState { +func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState { rval, ctrl, err := t.executeSyscall(sysno, args) if ctrl != nil { t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl) @@ -447,7 +381,7 @@ func ExtractErrno(err error, sysno int) int { return 0 case syscall.Errno: return int(err) - case SyscallRestartErrno: + case syserror.SyscallRestartErrno: return int(err) case *memmap.BusError: // Bus errors may generate SIGBUS, but for syscalls they still diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index b02044ad2..ce134bf54 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,17 +44,6 @@ func (t *Task) Deactivate() { } } -// CopyIn copies a fixed-size value or slice of fixed-size values in from the -// task's memory. The copy will fail with syscall.EFAULT if it traverses user -// memory that is unmapped or not readable by the user. -// -// This Task's AddressSpace must be active. -func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error) { - return usermem.CopyObjectIn(t, t.MemoryManager(), addr, dst, usermem.IOOpts{ - AddressSpaceActive: true, - }) -} - // CopyInBytes is a fast version of CopyIn if the caller can serialize the // data without reflection and pass in a byte slice. // @@ -64,17 +54,6 @@ func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { }) } -// CopyOut copies a fixed-size value or slice of fixed-size values out to the -// task's memory. The copy will fail with syscall.EFAULT if it traverses user -// memory that is unmapped or not writeable by the user. -// -// This Task's AddressSpace must be active. -func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error) { - return usermem.CopyObjectOut(t, t.MemoryManager(), addr, src, usermem.IOOpts{ - AddressSpaceActive: true, - }) -} - // CopyOutBytes is a fast version of CopyOut if the caller can serialize the // data without reflection and pass in a byte slice. // @@ -114,7 +93,7 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ var v []string for { argAddr := t.Arch().Native(0) - if _, err := t.CopyIn(addr, argAddr); err != nil { + if _, err := argAddr.CopyIn(t, addr); err != nil { return v, err } if t.Arch().Value(argAddr) == 0 { @@ -143,8 +122,9 @@ func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([ // CopyOutIovecs converts src to an array of struct iovecs and copies it to the // memory mapped at addr. // -// Preconditions: As for usermem.IO.CopyOut. The caller must be running on the -// task goroutine. t's AddressSpace must be active. +// Preconditions: Same as usermem.IO.CopyOut, plus: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error { switch t.Arch().Width() { case 8: @@ -191,8 +171,9 @@ func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error // combined length of all AddrRanges would otherwise exceed this amount, ranges // beyond MAX_RW_COUNT are silently truncated. // -// Preconditions: As for usermem.IO.CopyIn. The caller must be running on the -// task goroutine. t's AddressSpace must be active. +// Preconditions: Same as usermem.IO.CopyIn, plus: +// * The caller must be running on the task goroutine. +// * t's AddressSpace must be active. func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error) { if numIovecs == 0 { return usermem.AddrRangeSeq{}, nil @@ -284,7 +265,7 @@ func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOp // // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). // -// Preconditions: As for Task.CopyInIovecs. +// Preconditions: Same as Task.CopyInIovecs. func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return usermem.IOSequence{}, syserror.EINVAL @@ -299,3 +280,30 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp Opts: opts, }, nil } + +// copyContext implements marshal.CopyContext. It wraps a task to allow copying +// memory to and from the task memory with custom usermem.IOOpts. +type copyContext struct { + *Task + opts usermem.IOOpts +} + +// AsCopyContext wraps the task and returns it as CopyContext. +func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext { + return ©Context{t, opts} +} + +// CopyInString copies a string in from the task's memory. +func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) { + return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts) +} + +// CopyInBytes copies task memory into dst from an IO context. +func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) { + return t.MemoryManager().CopyIn(t, addr, dst, t.opts) +} + +// CopyOutBytes copies src into task memoryfrom an IO context. +func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) { + return t.MemoryManager().CopyOut(t, addr, src, t.opts) +} diff --git a/pkg/sentry/kernel/task_work.go b/pkg/sentry/kernel/task_work.go new file mode 100644 index 000000000..dda5a433a --- /dev/null +++ b/pkg/sentry/kernel/task_work.go @@ -0,0 +1,38 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import "sync/atomic" + +// TaskWorker is a deferred task. +// +// This must be savable. +type TaskWorker interface { + // TaskWork will be executed prior to returning to user space. Note that + // TaskWork may call RegisterWork again, but this will not be executed until + // the next return to user space, unlike in Linux. This effectively allows + // registration of indefinite user return hooks, but not by default. + TaskWork(t *Task) +} + +// RegisterWork can be used to register additional task work that will be +// performed prior to returning to user space. See TaskWorker.TaskWork for +// semantics regarding registration. +func (t *Task) RegisterWork(work TaskWorker) { + t.taskWorkMu.Lock() + defer t.taskWorkMu.Unlock() + atomic.AddInt32(&t.taskWorkCount, 1) + t.taskWork = append(t.taskWork, work) +} diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 52849f5b3..a183b28c1 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits } -// release releases the thread group's resources. -func (tg *ThreadGroup) release() { +// Release releases the thread group's resources. +func (tg *ThreadGroup) Release(ctx context.Context) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() @@ -325,7 +326,7 @@ func (tg *ThreadGroup) release() { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef() + tg.mounts.DecRef(ctx) } } @@ -366,7 +367,8 @@ func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error { // terminal is stolen, and all processes that had it as controlling // terminal lose it." - tty_ioctl(4) if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session { - if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 { + // Stealing requires CAP_SYS_ADMIN in the root user namespace. + if creds := auth.CredentialsFromContext(tg.leader); !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) || arg != 1 { return syserror.EPERM } // Steal the TTY away. Unlike TIOCNOTTY, don't send signals. diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index bf2dabb6e..fdadb52c0 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -36,6 +36,8 @@ import ( const TasksLimit = (1 << 16) // ThreadID is a generic thread identifier. +// +// +marshal type ThreadID int32 // String returns a decimal representation of the ThreadID. @@ -87,6 +89,13 @@ type TaskSet struct { // at time of save (but note that this is not necessarily the same thing as // sync.WaitGroup's zero value). runningGoroutines sync.WaitGroup `state:"nosave"` + + // aioGoroutines is the number of goroutines running async I/O + // callbacks. + // + // aioGoroutines is not saved but is required to be zero at the time of + // save. + aioGoroutines sync.WaitGroup `state:"nosave"` } // newTaskSet returns a new, empty TaskSet. @@ -256,6 +265,13 @@ func (ns *PIDNamespace) Tasks() []*Task { return tasks } +// NumTasks returns the number of tasks in ns. +func (ns *PIDNamespace) NumTasks() int { + ns.owner.mu.RLock() + defer ns.owner.mu.RUnlock() + return len(ns.tids) +} + // ThreadGroups returns a snapshot of the thread groups in ns. func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { return ns.ThreadGroupsAppend(nil) diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 7ba7dc50c..2817aa3ba 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -6,6 +6,7 @@ go_library( name = "time", srcs = [ "context.go", + "tcpip.go", "time.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/kernel/time/tcpip.go b/pkg/sentry/kernel/time/tcpip.go new file mode 100644 index 000000000..c4474c0cf --- /dev/null +++ b/pkg/sentry/kernel/time/tcpip.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package time + +import ( + "sync" + "time" +) + +// TcpipAfterFunc waits for duration to elapse according to clock then runs fn. +// The timer is started immediately and will fire exactly once. +func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer { + timer := &TcpipTimer{ + clock: clock, + } + timer.notifier = functionNotifier{ + fn: func() { + // tcpip.Timer.Stop() explicitly states that the function is called in a + // separate goroutine that Stop() does not synchronize with. + // Timer.Destroy() synchronizes with calls to TimerListener.Notify(). + // This is semantically meaningful because, in the former case, it's + // legal to call tcpip.Timer.Stop() while holding locks that may also be + // taken by the function, but this isn't so in the latter case. Most + // immediately, Timer calls TimerListener.Notify() while holding + // Timer.mu. A deadlock occurs without spawning a goroutine: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, deadlock! + // + // Spawning a goroutine avoids the deadlock: + // T1: (Timer expires) + // => Timer.Tick() <- Timer.mu.Lock() called + // => TimerListener.Notify() <- Launches T2 + // T2: + // => Timer.Stop() + // => Timer.Destroy() <- Timer.mu.Lock() called, blocks + // T1: + // => (returns) <- Timer.mu.Unlock() called + // T2: + // => (continues) <- No deadlock! + go func() { + timer.Stop() + fn() + }() + }, + } + timer.Reset(duration) + return timer +} + +// TcpipTimer is a resettable timer with variable duration expirations. +// Implements tcpip.Timer, which does not define a Destroy method; instead, all +// resources are released after timer expiration and calls to Timer.Stop. +// +// Must be created by AfterFunc. +type TcpipTimer struct { + // clock is the time source. clock is immutable. + clock Clock + + // notifier is called when the Timer expires. notifier is immutable. + notifier functionNotifier + + // mu protects t. + mu sync.Mutex + + // t stores the latest running Timer. This is replaced whenever Reset is + // called since Timer cannot be restarted once it has been Destroyed by Stop. + // + // This field is nil iff Stop has been called. + t *Timer +} + +// Stop implements tcpip.Timer.Stop. +func (r *TcpipTimer) Stop() bool { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + return false + } + _, lastSetting := r.t.Swap(Setting{}) + r.t.Destroy() + r.t = nil + return lastSetting.Enabled +} + +// Reset implements tcpip.Timer.Reset. +func (r *TcpipTimer) Reset(d time.Duration) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.t == nil { + r.t = NewTimer(r.clock, &r.notifier) + } + + r.t.Swap(Setting{ + Enabled: true, + Period: 0, + Next: r.clock.Now().Add(d), + }) +} + +// functionNotifier is a TimerListener that runs a function. +// +// functionNotifier cannot be saved or loaded. +type functionNotifier struct { + fn func() +} + +// Notify implements ktime.TimerListener.Notify. +func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) { + f.fn() + return Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (f *functionNotifier) Destroy() {} diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index e959700f2..f61a8e164 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -616,8 +616,10 @@ func (t *Timer) Swap(s Setting) (Time, Setting) { // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true // starts the timer, while setting s.Enabled to false stops it. // -// Preconditions: The Timer must not be paused. f cannot call any Timer methods -// since it is called with the Timer mutex locked. +// Preconditions: +// * The Timer must not be paused. +// * f cannot call any Timer methods since it is called with the Timer mutex +// locked. func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { now := t.clock.Now() t.mu.Lock() diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index da0ea7bb5..7c4fefb16 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" ) @@ -90,7 +90,7 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. -func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) { return &Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), }, nil @@ -186,6 +186,7 @@ func (t *Timekeeper) startUpdater() { timer := time.NewTicker(sentrytime.ApproxUpdateInterval) t.wg.Add(1) go func() { // S/R-SAFE: stopped during save. + defer t.wg.Done() for { // Start with an update immediately, so the clocks are // ready ASAP. @@ -209,9 +210,6 @@ func (t *Timekeeper) startUpdater() { p.realtimeBaseRef = int64(realtimeParams.BaseRef) p.realtimeFrequency = realtimeParams.Frequency } - - log.Debugf("Updating VDSO parameters: %+v", p) - return p }); err != nil { log.Warningf("Unable to update VDSO parameter page: %v", err) @@ -220,7 +218,6 @@ func (t *Timekeeper) startUpdater() { select { case <-timer.C: case <-t.stop: - t.wg.Done() return } } diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index f1b3c212c..9bc452e67 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -17,10 +17,9 @@ package kernel import ( "fmt" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -28,6 +27,8 @@ import ( // // They are exposed to the VDSO via a parameter page managed by VDSOParamPage, // which also includes a sequence counter. +// +// +marshal type vdsoParams struct { monotonicReady uint64 monotonicBaseCycles int64 @@ -58,7 +59,7 @@ type vdsoParams struct { type VDSOParamPage struct { // The parameter page is fr, allocated from mfp.MemoryFile(). mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange // seq is the current sequence count written to the page. // @@ -68,21 +69,29 @@ type VDSOParamPage struct { // checked in state_test_util tests, causing this field to change across // save / restore. seq uint64 + + // copyScratchBuffer is a temporary buffer used to marshal the params before + // copying it to the real parameter page. The parameter page is typically + // updated at a moderate frequency of ~O(seconds) throughout the lifetime of + // the sentry, so reusing this buffer is a good tradeoff between memory + // usage and the cost of allocation. + copyScratchBuffer []byte } // NewVDSOParamPage returns a VDSOParamPage. // // Preconditions: -// // * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does // not take ownership of fr; it must remain allocated for the lifetime of the // VDSOParamPage. -// // * VDSOParamPage must be the only writer to fr. -// // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. -func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { - return &VDSOParamPage{mfp: mfp, fr: fr} +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage { + return &VDSOParamPage{ + mfp: mfp, + fr: fr, + copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()), + } } // access returns a mapping of the param page. @@ -136,7 +145,8 @@ func (v *VDSOParamPage) Write(f func() vdsoParams) error { // Get the new params. p := f() - buf := binary.Marshal(nil, usermem.ByteOrder, p) + buf := v.copyScratchBuffer[:p.SizeBytes()] + p.MarshalUnsafe(buf) // Skip the sequence counter. if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil { diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index 77e1fe217..0bade6e57 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -33,3 +33,12 @@ func FromContext(ctx context.Context) *LimitSet { } return nil } + +// FromContextOrDie returns FromContext(ctx) if the latter is not nil. +// Otherwise, panic is triggered. +func FromContextOrDie(ctx context.Context) *LimitSet { + if v := ctx.Value(CtxLimits); v != nil { + return v.(*LimitSet) + } + panic("failed to create limit set from context") +} diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index c6aa65f28..34bdb0b69 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -30,9 +30,6 @@ go_library( "//pkg/rand", "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/fs", - "//pkg/sentry/fs/anon", - "//pkg/sentry/fs/fsutil", "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", @@ -45,6 +42,5 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 616fafa2c..98af2cc38 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -90,14 +90,23 @@ type elfInfo struct { sharedObject bool } +// fullReader interface extracts the ReadFull method from fsbridge.File so that +// client code does not need to define an entire fsbridge.File when only read +// functionality is needed. +// +// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap +// vfs.FileDescription's PRead/Read instead. +type fullReader interface { + // ReadFull is the same as fsbridge.File.ReadFull. + ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) +} + // parseHeader parse the ELF header, verifying that this is a supported ELF // file and returning the ELF program headers. // // This is similar to elf.NewFile, except that it is more strict about what it // accepts from the ELF, and it doesn't parse unnecessary parts of the file. -// -// ctx may be nil if f does not need it. -func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { +func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. var ident [elf.EI_NIDENT]byte @@ -185,6 +194,10 @@ func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) return elfInfo{}, syserror.ENOEXEC } + if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 { + ctx.Infof("Unsupported phdr offset %d", hdr.Phoff) + return elfInfo{}, syserror.ENOEXEC + } phdrBuf := make([]byte, totalPhdrSize) _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) @@ -272,7 +285,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr } defer func() { if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() + mopts.MappingIdentity.DecRef(ctx) } }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { @@ -393,8 +406,7 @@ type loadedELF struct { // // It does not load the ELF interpreter, or return any auxv entries. // -// Preconditions: -// * f is an ELF file +// Preconditions: f is an ELF file. func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { first := true var start, end usermem.Addr @@ -429,6 +441,10 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz) return loadedELF{}, syserror.ENOEXEC } + if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 { + ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off) + return loadedELF{}, syserror.ENOEXEC + } path := make([]byte, phdr.Filesz) _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) @@ -562,8 +578,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, in // It does not load the ELF interpreter, or return any auxv entries. // // Preconditions: -// * f is an ELF file -// * f is the first ELF loaded into m +// * f is an ELF file. +// * f is the first ELF loaded into m. func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) { info, err := parseHeader(ctx, f) if err != nil { @@ -600,8 +616,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // // It does not return any auxv entries. // -// Preconditions: -// * f is an ELF file +// Preconditions: f is an ELF file. func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { @@ -631,8 +646,7 @@ func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.Fil // If loadELF returns ErrSwitchFile it should be called again with the returned // path and argv. // -// Preconditions: -// * args.File is an ELF file +// Preconditions: args.File is an ELF file. func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) { bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File) if err != nil { @@ -654,7 +668,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer intFile.DecRef() + defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 88449fe95..c69b62db9 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -80,22 +79,6 @@ type LoadArgs struct { Features *cpuid.FeatureSet } -// readFull behaves like io.ReadFull for an *fs.File. -func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - var total int64 - for dst.NumBytes() > 0 { - n, err := f.Preadv(ctx, dst, offset+total) - total += n - if err == io.EOF && total != 0 { - return total, io.ErrUnexpectedEOF - } else if err != nil { - return total, err - } - dst = dst.DropFirst64(n) - } - return total, nil -} - // openPath opens args.Filename and checks that it is valid for loading. // // openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not @@ -139,7 +122,7 @@ func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch if err != nil { return nil, err } - return &arch.Stack{a, m, ar.End}, nil + return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil } const ( @@ -171,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context return loadedELF{}, nil, nil, nil, err } // Ensure file is release in case the code loops or errors out. - defer args.File.DecRef() + defer args.File.DecRef(ctx) } else { if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { return loadedELF{}, nil, nil, nil, err @@ -232,20 +215,20 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // path and argv. // // Preconditions: -// * The Task MemoryManager is empty. -// * Load is called on the Task goroutine. +// * The Task MemoryManager is empty. +// * Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the executable itself. loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer file.DecRef() + defer file.DecRef(ctx) // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) } // Setup the heap. brk starts at the next page after the end of the @@ -264,20 +247,20 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V } // Push the original filename to the stack, for AT_EXECFN. - execfn, err := stack.Push(args.Filename) - if err != nil { + if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux()) } + execfn := stack.Bottom // Push 16 random bytes on the stack which AT_RANDOM will point to. var b [16]byte if _, err := rand.Read(b[:]); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux()) } - random, err := stack.Push(b) - if err != nil { + if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux()) } + random := stack.Bottom c := auth.CredentialsFromContext(ctx) @@ -309,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(file) + m.SetExecutable(ctx, file) symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") if err != nil { diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 00977fc08..241d87835 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -26,10 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/anon" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -37,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" ) const vdsoPrelink = 0xffffffffff700000 @@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} { } } -// byteReader implements fs.FileOperations for reading from a []byte source. -type byteReader struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - +type byteFullReader struct { data []byte } -var _ fs.FileOperations = (*byteReader)(nil) - -// newByteReaderFile creates a fake file to read data from. -// -// TODO(gvisor.dev/issue/1623): Convert to VFS2. -func newByteReaderFile(ctx context.Context, data []byte) *fs.File { - // Create a fake inode. - inode := fs.NewInode( - ctx, - &fsutil.SimpleFileInode{}, - fs.NewPseudoMountSource(ctx), - fs.StableAttr{ - Type: fs.Anonymous, - DeviceID: anon.PseudoDevice.DeviceID(), - InodeID: anon.PseudoDevice.NextIno(), - BlockSize: usermem.PageSize, - }) - - // Use the fake inode to create a fake dirent. - dirent := fs.NewTransientDirent(inode) - defer dirent.DecRef() - - // Use the fake dirent to make a fake file. - flags := fs.FileFlags{Read: true, Pread: true} - return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{ - data: data, - }) -} - -func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { +func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } @@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ return int64(n), err } -func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - panic("Write not supported") -} - // validateVDSO checks that the VDSO can be loaded by loadVDSO. // // VDSOs are special (see below). Since we are going to map the VDSO directly @@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq // * PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. -func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) { +func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) @@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. -func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) { - vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin)) +func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { + vdsoFile := &byteFullReader{data: vdsoBin} // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin))) - vdsoFile.DecRef() if err != nil { return nil, err } @@ -431,3 +380,9 @@ func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) return vdsoAddr, nil } + +// Release drops references on mappings held by v. +func (v *VDSO) Release(ctx context.Context) { + v.ParamPage.DecRef(ctx) + v.vdso.DecRef(ctx) +} diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index a98b66de1..2c95669cd 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -28,9 +28,21 @@ go_template_instance( }, ) +go_template_instance( + name = "file_range", + out = "file_range.go", + package = "memmap", + prefix = "File", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + go_library( name = "memmap", srcs = [ + "file_range.go", "mappable_range.go", "mapping_set.go", "mapping_set_impl.go", @@ -40,7 +52,7 @@ go_library( deps = [ "//pkg/context", "//pkg/log", - "//pkg/sentry/platform", + "//pkg/safemem", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index d609c1ae0..457ed87f8 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -177,7 +177,7 @@ func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr // AddMapping adds the given mapping and returns the set of MappableRanges that // previously had no mappings. // -// Preconditions: As for Mappable.AddMapping. +// Preconditions: Same as Mappable.AddMapping. func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var mapped []MappableRange @@ -204,7 +204,7 @@ func (s *MappingSet) AddMapping(ms MappingSpace, ar usermem.AddrRange, offset ui // RemoveMapping removes the given mapping and returns the set of // MappableRanges that now have no mappings. // -// Preconditions: As for Mappable.RemoveMapping. +// Preconditions: Same as Mappable.RemoveMapping. func (s *MappingSet) RemoveMapping(ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var unmapped []MappableRange diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c6db9fc8f..7fd77925f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -19,18 +19,18 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 -// offsets to (platform.File, uint64 File offset) pairs. +// offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // -// Preconditions: For all Mappable methods, usermem.AddrRanges and -// MappableRanges must be non-empty (Length() != 0), and usermem.Addrs and -// Mappable offsets must be page-aligned. +// All Mappable methods have the following preconditions: +// * usermem.AddrRanges and MappableRanges must be non-empty (Length() != 0). +// * usermem.Addrs and Mappable offsets must be page-aligned. type Mappable interface { // AddMapping notifies the Mappable of a mapping from addresses ar in ms to // offsets [offset, offset+ar.Length()) in this Mappable. @@ -48,8 +48,10 @@ type Mappable interface { // addresses ar in ms to offsets [offset, offset+ar.Length()) in this // Mappable. // - // Preconditions: offset+ar.Length() does not overflow. The removed mapping - // must exist. writable must match the corresponding call to AddMapping. + // Preconditions: + // * offset+ar.Length() does not overflow. + // * The removed mapping must exist. writable must match the + // corresponding call to AddMapping. RemoveMapping(ctx context.Context, ms MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms @@ -60,9 +62,10 @@ type Mappable interface { // CopyMapping is only called when a mapping is copied within a given // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // - // Preconditions: offset+srcAR.Length() and offset+dstAR.Length() do not - // overflow. The mapping at srcAR must exist. writable must match the - // corresponding call to AddMapping. + // Preconditions: + // * offset+srcAR.Length() and offset+dstAR.Length() do not overflow. + // * The mapping at srcAR must exist. writable must match the + // corresponding call to AddMapping. CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error // Translate returns the Mappable's current mappings for at least the range @@ -74,14 +77,17 @@ type Mappable interface { // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one - // reference is held on all pages in a platform.File that may be the result + // reference is held on all pages in a File that may be the result // of a valid Translation. // - // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). - // required and optional must be page-aligned. The caller must have - // established a mapping for all of the queried offsets via a previous call - // to AddMapping. The caller is responsible for ensuring that calls to - // Translate synchronize with invalidation. + // Preconditions: + // * required.Length() > 0. + // * optional.IsSupersetOf(required). + // * required and optional must be page-aligned. + // * The caller must have established a mapping for all of the queried + // offsets via a previous call to AddMapping. + // * The caller is responsible for ensuring that calls to Translate + // synchronize with invalidation. // // Postconditions: See CheckTranslateResult. Translate(ctx context.Context, required, optional MappableRange, at usermem.AccessType) ([]Translation, error) @@ -100,7 +106,7 @@ type Translation struct { Source MappableRange // File is the mapped file. - File platform.File + File File // Offset is the offset into File at which this Translation begins. Offset uint64 @@ -110,18 +116,18 @@ type Translation struct { Perms usermem.AccessType } -// FileRange returns the platform.FileRange represented by t. -func (t Translation) FileRange() platform.FileRange { - return platform.FileRange{t.Offset, t.Offset + t.Source.Length()} +// FileRange returns the FileRange represented by t. +func (t Translation) FileRange() FileRange { + return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all // postconditions for Mappable.Translate(required, optional, at). // -// Preconditions: As for Mappable.Translate. +// Preconditions: Same as Mappable.Translate. func CheckTranslateResult(required, optional MappableRange, at usermem.AccessType, ts []Translation, terr error) error { // Verify that the inputs to Mappable.Translate were valid. - if !required.WellFormed() || required.Length() <= 0 { + if !required.WellFormed() || required.Length() == 0 { panic(fmt.Sprintf("invalid required range: %v", required)) } if !usermem.Addr(required.Start).IsPageAligned() || !usermem.Addr(required.End).IsPageAligned() { @@ -139,7 +145,7 @@ func CheckTranslateResult(required, optional MappableRange, at usermem.AccessTyp return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required) } for i, t := range ts { - if !t.Source.WellFormed() || t.Source.Length() <= 0 { + if !t.Source.WellFormed() || t.Source.Length() == 0 { return fmt.Errorf("Translation %+v has invalid Source", t) } if !usermem.Addr(t.Source.Start).IsPageAligned() || !usermem.Addr(t.Source.End).IsPageAligned() { @@ -214,7 +220,9 @@ type MappingSpace interface { // Invalidate must not take any locks preceding mm.MemoryManager.activeMu // in the lock order. // - // Preconditions: ar.Length() != 0. ar must be page-aligned. + // Preconditions: + // * ar.Length() != 0. + // * ar must be page-aligned. Invalidate(ar usermem.AddrRange, opts InvalidateOpts) } @@ -238,7 +246,7 @@ type MappingIdentity interface { IncRef() // DecRef decrements the MappingIdentity's reference count. - DecRef() + DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. @@ -360,4 +368,62 @@ type MMapOpts struct { // // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string + + // Force means to skip validation checks of Addr and Length. It can be + // used to create special mappings below mm.layout.MinAddr and + // mm.layout.MaxAddr. It has to be used with caution. + // + // If Force is true, Unmap and Fixed must be true. + Force bool +} + +// File represents a host file that may be mapped into an platform.AddressSpace. +type File interface { + // All pages in a File are reference-counted. + + // IncRef increments the reference count on all pages in fr. + // + // Preconditions: + // * fr.Start and fr.End must be page-aligned. + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) + IncRef(fr FileRange) + + // DecRef decrements the reference count on all pages in fr. + // + // Preconditions: + // * fr.Start and fr.End must be page-aligned. + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. + DecRef(fr FileRange) + + // MapInternal returns a mapping of the given file offsets in the invoking + // process' address space for reading and writing. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: + // * fr.Length() > 0. + // * At least one reference must be held on all pages in fr. + // + // Postconditions: The returned mapping is valid as long as at least one + // reference is held on the mapped pages. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) + + // FD returns the file descriptor represented by the File. + // + // The only permitted operation on the returned file descriptor is to map + // pages from it consistent with the requirements of AddressSpace.MapFile. + FD() int +} + +// FileRange represents a range of uint64 offsets into a File. +// +// type FileRange <generated using go_generics> + +// String implements fmt.Stringer.String. +func (fr FileRange) String() string { + return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) } diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a036ce53c..6dbeccfe2 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -7,14 +7,14 @@ go_template_instance( name = "file_refcount_set", out = "file_refcount_set.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "mm", prefix = "fileRefcount", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "int32", "Functions": "fileRefcountSetFunctions", }, @@ -73,12 +73,35 @@ go_template_instance( }, ) +go_template_instance( + name = "aio_mappable_refs", + out = "aio_mappable_refs.go", + package = "mm", + prefix = "aioMappable", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "aioMappable", + }, +) + +go_template_instance( + name = "special_mappable_refs", + out = "special_mappable_refs.go", + package = "mm", + prefix = "SpecialMappable", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "SpecialMappable", + }, +) + go_library( name = "mm", srcs = [ "address_space.go", "aio_context.go", "aio_context_state.go", + "aio_mappable_refs.go", "debug.go", "file_refcount_set.go", "io.go", @@ -92,6 +115,7 @@ go_library( "save_restore.go", "shm.go", "special_mappable.go", + "special_mappable_refs.go", "syscalls.go", "vma.go", "vma_set.go", @@ -103,6 +127,7 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safecopy", "//pkg/safemem", "//pkg/sentry/arch", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 5c667117c..a93e76c75 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -166,8 +166,12 @@ func (mm *MemoryManager) Deactivate() { // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings // for all addresses in ar should be precommitted. // -// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// Preconditions: +// * mm.activeMu must be locked. +// * mm.as != nil. +// * ar.Length() != 0. +// * ar must be page-aligned. +// * pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 379148903..7bf48cb2c 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,10 +17,8 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -240,10 +238,10 @@ func (ctx *AIOContext) Drain() { // // +stateify savable type aioMappable struct { - refs.AtomicRefCount + aioMappableRefs mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) @@ -254,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { return nil, err } m := aioMappable{mfp: mfp, fr: fr} - m.EnableLeakCheck("mm.aioMappable") + m.EnableLeakCheck() return &m, nil } // DecRef implements refs.RefCounter.DecRef. -func (m *aioMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *aioMappable) DecRef(ctx context.Context) { + m.aioMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -368,7 +366,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index fa776f9c6..a8ac48080 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -441,7 +441,10 @@ func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts // handleASIOFault handles a page fault at address addr for an AddressSpaceIO // operation spanning ioar. // -// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr). +// Preconditions: +// * mm.as != nil. +// * ioar.Length() != 0. +// * ioar.Contains(addr). func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error { // Try to map all remaining pages in the I/O operation. This RoundUp can't // overflow because otherwise it would have been caught by CheckIORange. @@ -629,7 +632,9 @@ func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars userme // at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to // truncate usermem.AddrRangeSeq when errors occur. // -// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +// Preconditions: +// * !arsit.IsEmpty(). +// * end <= arsit.Head().End. func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { ar := arsit.Head() if end <= ar.Start { diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index aac56679b..09dbc06a4 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -57,6 +57,8 @@ func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arc // Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or // clone() (without CLONE_VM). func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.AddressSpace().PreFork() + defer mm.AddressSpace().PostFork() mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.mappingMu.RLock() @@ -258,7 +260,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { mm.executable = nil mm.metadataMu.Unlock() if exe != nil { - exe.DecRef() + exe.DecRef(ctx) } mm.activeMu.Lock() diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 28e5057f7..0cfd60f6c 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -15,6 +15,7 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" @@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(file fsbridge.File) { +func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. @@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) { // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. if orig != nil { - orig.DecRef() + orig.DecRef(ctx) } } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 6db7c3d40..92cc87d84 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -25,7 +25,7 @@ // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu // platform.AddressSpace locks -// platform.File locks +// memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu // @@ -235,6 +235,20 @@ type MemoryManager struct { // vdsoSigReturnAddr is the address of 'vdso_sigreturn'. vdsoSigReturnAddr uint64 + + // membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has + // previously been called. Since, as of this writing, + // MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory + // barrier, membarrierPrivateEnabled has no other effect. + // + // membarrierPrivateEnabled is accessed using atomic memory operations. + membarrierPrivateEnabled uint32 + + // membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously + // been called. + // + // membarrierRSeqEnabled is accessed using atomic memory operations. + membarrierRSeqEnabled uint32 } // vma represents a virtual memory area. @@ -242,7 +256,7 @@ type MemoryManager struct { // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is - // nil, the vma represents a private anonymous mapping. + // nil, the vma represents an anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is @@ -396,7 +410,7 @@ type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. - file platform.File `state:"nosave"` + file memmap.File `state:"nosave"` // off is the offset into file at which this pma begins. // @@ -436,7 +450,7 @@ type pma struct { private bool // If internalMappings is not empty, it is the cached return value of - // file.MapInternal for the platform.FileRange mapped by this pma. + // file.MapInternal for the memmap.FileRange mapped by this pma. internalMappings safemem.BlockSeq `state:"nosave"` } @@ -469,10 +483,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 { func (fileRefcountSetFunctions) ClearValue(_ *int32) { } -func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { +func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } -func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { +func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc } diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index fdc308542..acac3d357 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) { defer mm.DecUsers(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ - Length: 2 * usermem.PageSize, + Length: 2 * usermem.PageSize, + Private: true, }) if err != nil { t.Fatalf("MMap got err %v want nil", err) diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 62e4c20af..7e5f7de64 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -32,10 +31,12 @@ import ( // iterator to the pma containing ar.Start. Otherwise it returns a terminal // iterator. // -// Preconditions: mm.activeMu must be locked. ar.Length() != 0. +// Preconditions: +// * mm.activeMu must be locked. +// * ar.Length() != 0. func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -90,13 +91,16 @@ func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at user // // - An error that is non-nil if pmas exist for only a subset of ar. // -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist -// for all addresses in ar, and support accesses of type at (i.e. permission -// checks must have been performed against vmas). +// Preconditions: +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. +// * ar.Length() != 0. +// * vseg.Range().Contains(ar.Start). +// * vmas must exist for all addresses in ar, and support accesses of type at +// (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { @@ -136,9 +140,11 @@ func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar // exist. If this is not equal to ars, it returns a non-nil error explaining // why. // -// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for -// writing. vmas must exist for all addresses in ars, and support accesses of -// type at (i.e. permission checks must have been performed against vmas). +// Preconditions: +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. +// * vmas must exist for all addresses in ars, and support accesses of type at +// (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() @@ -187,7 +193,7 @@ func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrR // getVecPMAsLocked; other clients should call one of those instead. func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { @@ -217,7 +223,7 @@ func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIter // Need a pma here. optAR := vseg.Range().Intersect(pgap.Range()) if checkInvariants { - if optAR.Length() <= 0 { + if optAR.Length() == 0 { panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) } } @@ -519,8 +525,10 @@ func privateAligned(ar usermem.AddrRange) usermem.AddrRange { // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory // and update the pma to indicate that it does not require copy-on-write. // -// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be -// locked. mm.activeMu must be locked for writing. +// Preconditions: +// * vseg.Range().IsSupersetOf(pseg.Range()). +// * mm.mappingMu must be locked. +// * mm.activeMu must be locked for writing. func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { pma := pseg.ValuePtr() if !pma.needCOW { @@ -552,7 +560,7 @@ func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterat // Invalidate implements memmap.MappingSpace.Invalidate. func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -569,11 +577,13 @@ func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.Invalidate // invalidateLocked removes pmas and AddressSpace mappings of those pmas for // addresses in ar. // -// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar -// must be page-aligned. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -604,7 +614,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat } } -// Pin returns the platform.File ranges currently mapped by addresses in ar in +// Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of @@ -614,10 +624,12 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat // most I/O. It should only be used in contexts that would use get_user_pages() // in the Linux kernel. // -// Preconditions: ar.Length() != 0. ar must be page-aligned. +// Preconditions: +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -674,15 +686,15 @@ type PinnedRange struct { Source usermem.AddrRange // File is the mapped file. - File platform.File + File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } -// FileRange returns the platform.File offsets mapped by pr. -func (pr PinnedRange) FileRange() platform.FileRange { - return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +// FileRange returns the memmap.File offsets mapped by pr. +func (pr PinnedRange) FileRange() memmap.FileRange { + return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. @@ -694,15 +706,19 @@ func Unpin(prs []PinnedRange) { // movePMAsLocked moves all pmas in oldAR to newAR. // -// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. -// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR). -// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * oldAR.Length() != 0. +// * oldAR.Length() <= newAR.Length(). +// * !oldAR.Overlaps(newAR). +// * mm.pmas.IsEmptyRange(newAR). +// * oldAR and newAR must be page-aligned. func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { if checkInvariants { - if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() { panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) } - if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() { panic(fmt.Sprintf("invalid newAR: %v", newAR)) } if oldAR.Length() > newAR.Length() { @@ -752,15 +768,17 @@ func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { // - An error that is non-nil if internal mappings exist for only a subset of // ar. // -// Preconditions: mm.activeMu must be locked for writing. -// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar. -// ar.Length() != 0. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * pseg.Range().Contains(ar.Start). +// * pmas must exist for all addresses in ar. +// * ar.Length() != 0. // // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { @@ -784,8 +802,9 @@ func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar userm // internal mappings exist. If this is not equal to ars, it returns a non-nil // error explaining why. // -// Preconditions: mm.activeMu must be locked for writing. pmas must exist for -// all addresses in ar. +// Preconditions: +// * mm.activeMu must be locked for writing. +// * pmas must exist for all addresses in ar. // // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. @@ -804,12 +823,15 @@ func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSe // internalMappingsLocked returns internal mappings for addresses in ar. // -// Preconditions: mm.activeMu must be locked. Internal mappings must have been -// previously established for all addresses in ar. ar.Length() != 0. -// pseg.Range().Contains(ar.Start). +// Preconditions: +// * mm.activeMu must be locked. +// * Internal mappings must have been previously established for all addresses +// in ar. +// * ar.Length() != 0. +// * pseg.Range().Contains(ar.Start). func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { @@ -840,8 +862,10 @@ func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.Add // vecInternalMappingsLocked returns internal mappings for addresses in ars. // -// Preconditions: mm.activeMu must be locked. Internal mappings must have been -// previously established for all addresses in ars. +// Preconditions: +// * mm.activeMu must be locked. +// * Internal mappings must have been previously established for all addresses +// in ars. func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { var ims []safemem.Block for ; !ars.IsEmpty(); ars = ars.Tail() { @@ -857,7 +881,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf } // incPrivateRef acquires a reference on private pages in fr. -func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { +func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) { mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() refSet := &mm.privateRefs.refs @@ -878,8 +902,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { } // decPrivateRef releases a reference on private pages in fr. -func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { - var freed []platform.FileRange +func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) { + var freed []memmap.FileRange mm.privateRefs.mu.Lock() refSet := &mm.privateRefs.refs @@ -951,7 +975,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the - // platform.File might not. + // memmap.File might not. pma1.internalMappings = safemem.BlockSeq{} return pma1, true } @@ -970,7 +994,9 @@ func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (p // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do // so by scanning linearly backward from pgap. // -// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). +// Preconditions: +// * mm.activeMu must be locked. +// * addr <= pgap.Start(). func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { if checkInvariants { if !pgap.Ok() { @@ -1012,17 +1038,19 @@ func (pseg pmaIterator) getInternalMappingsLocked() error { return nil } -func (pseg pmaIterator) fileRange() platform.FileRange { +func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } -// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. -func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { +// Preconditions: +// * pseg.Range().IsSupersetOf(ar). +// * ar.Length != 0. +func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") } - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().IsSupersetOf(ar) { @@ -1032,5 +1060,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { pma := pseg.ValuePtr() pstart := pseg.Start() - return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} + return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9ad52082d..2dbe5b751 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,10 +16,8 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -32,10 +30,10 @@ import ( // // +stateify savable type SpecialMappable struct { - refs.AtomicRefCount + SpecialMappableRefs mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange name string } @@ -44,15 +42,15 @@ type SpecialMappable struct { // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} - m.EnableLeakCheck("mm.SpecialMappable") + m.EnableLeakCheck() return &m } // DecRef implements refs.RefCounter.DecRef. -func (m *SpecialMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *SpecialMappable) DecRef(ctx context.Context) { + m.SpecialMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -126,7 +124,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that // store the SpecialMappable's contents. -func (m *SpecialMappable) FileRange() platform.FileRange { +func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } @@ -138,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 { // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. // -// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux -// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should -// do the same to get non-zero device and inode IDs. +// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by +// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses +// a SpecialMappable instead, incorrectly getting device and inode IDs of zero +// and causing memory for shared anonymous mappings to be allocated up-front +// instead of on first touch; this is to avoid exacerbating the fs.MountSource +// leak (b/143656263). Delete this function along with VFS1. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, syserror.EINVAL diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 3f496aa9f..675efdc7c 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -17,6 +17,7 @@ package mm import ( "fmt" mrand "math/rand" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -93,18 +93,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme } } else { opts.Offset = 0 - if !opts.Private { - if opts.MappingIdentity != nil { - return 0, syserror.EINVAL - } - m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) - if err != nil { - return 0, err - } - defer m.DecRef() - opts.MappingIdentity = m - opts.Mappable = m - } } if opts.Addr.RoundDown() != opts.Addr { @@ -166,7 +154,9 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // populateVMA obtains pmas for addresses in ar in the given vma, and maps them // into mm.as if it is active. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +// Preconditions: +// * mm.mappingMu must be locked. +// * vseg.Range().IsSupersetOf(ar). func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See @@ -208,8 +198,9 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u // preferable to populateVMA since it unlocks mm.mappingMu before performing // expensive operations that don't require it to be locked. // -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { @@ -1191,7 +1182,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) - id.DecRef() + id.DecRef(ctx) if err != nil { return err } @@ -1284,3 +1275,27 @@ func (mm *MemoryManager) VirtualDataSize() uint64 { defer mm.mappingMu.RUnlock() return mm.dataAS } + +// EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to +// return true. +func (mm *MemoryManager) EnableMembarrierPrivate() { + atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1) +} + +// IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has +// previously been called. +func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool { + return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0 +} + +// EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to +// return true. +func (mm *MemoryManager) EnableMembarrierRSeq() { + atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1) +} + +// IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has +// previously been called. +func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool { + return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0 +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 16d8207e9..b8df72813 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -27,8 +27,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// Preconditions: mm.mappingMu must be locked for writing. opts must be valid -// as defined by the checks in MMap. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * opts must be valid as defined by the checks in MMap. func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { if opts.MaxPerms != opts.MaxPerms.Effective() { panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) @@ -42,7 +43,12 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp Map32Bit: opts.Map32Bit, }) if err != nil { - return vmaIterator{}, usermem.AddrRange{}, err + // Can't force without opts.Unmap and opts.Fixed. + if opts.Force && opts.Unmap && opts.Fixed { + addr = opts.Addr + } else { + return vmaIterator{}, usermem.AddrRange{}, err + } } ar, _ := addr.ToRange(opts.Length) @@ -255,11 +261,12 @@ func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { // // - An error that is non-nil if vmas exist for only a subset of ar. // -// Preconditions: mm.mappingMu must be locked for reading; it may be -// temporarily unlocked. ar.Length() != 0. +// Preconditions: +// * mm.mappingMu must be locked for reading; it may be temporarily unlocked. +// * ar.Length() != 0. func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -337,11 +344,13 @@ const guardBytes = 256 * usermem.PageSize // unmapLocked unmaps all addresses in ar and returns the resulting gap in // mm.vmas. // -// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. -// ar must be page-aligned. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -356,11 +365,13 @@ func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients // must do so before calling removeVMAsLocked. // -// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar -// must be page-aligned. +// Preconditions: +// * mm.mappingMu must be locked for writing. +// * ar.Length() != 0. +// * ar must be page-aligned. func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator { if checkInvariants { - if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } @@ -377,7 +388,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { - vma.id.DecRef() + vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { @@ -446,7 +457,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa } if vma2.id != nil { - vma2.id.DecRef() + vma2.id.DecRef(context.Background()) } return vma1, true } @@ -462,7 +473,9 @@ func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (v return v, v2 } -// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr). +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.Range().Contains(addr). func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 { if checkInvariants { if !vseg.Ok() { @@ -486,8 +499,10 @@ func (vseg vmaIterator) mappableRange() memmap.MappableRange { return vseg.mappableRangeOf(vseg.Range()) } -// Preconditions: vseg.ValuePtr().mappable != nil. -// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.Range().IsSupersetOf(ar). +// * ar.Length() != 0. func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { if checkInvariants { if !vseg.Ok() { @@ -496,7 +511,7 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } - if !ar.WellFormed() || ar.Length() <= 0 { + if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Range().IsSupersetOf(ar) { @@ -509,8 +524,10 @@ func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRan return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} } -// Preconditions: vseg.ValuePtr().mappable != nil. -// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +// Preconditions: +// * vseg.ValuePtr().mappable != nil. +// * vseg.mappableRange().IsSupersetOf(mr). +// * mr.Length() != 0. func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { if checkInvariants { if !vseg.Ok() { @@ -519,7 +536,7 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } - if !mr.WellFormed() || mr.Length() <= 0 { + if !mr.WellFormed() || mr.Length() == 0 { panic(fmt.Sprintf("invalid mr: %v", mr)) } if !vseg.mappableRange().IsSupersetOf(mr) { @@ -535,7 +552,9 @@ func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by // scanning linearly forward from vseg. // -// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +// Preconditions: +// * mm.mappingMu must be locked. +// * addr >= vseg.Start(). func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { if checkInvariants { if !vseg.Ok() { diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 1eeb9f317..5b09b9feb 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -33,21 +33,42 @@ go_template_instance( out = "usage_set.go", consts = { "minDegree": "10", + "trackGaps": "1", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "usage", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "usageInfo", "Functions": "usageSetFunctions", }, ) +go_template_instance( + name = "reclaim_set", + out = "reclaim_set.go", + consts = { + "minDegree": "10", + }, + imports = { + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", + }, + package = "pgalloc", + prefix = "reclaim", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.FileRange", + "Value": "reclaimSetValue", + "Functions": "reclaimSetFunctions", + }, +) + go_library( name = "pgalloc", srcs = [ @@ -56,20 +77,23 @@ go_library( "evictable_range_set.go", "pgalloc.go", "pgalloc_unsafe.go", + "reclaim_set.go", "save_restore.go", "usage_set.go", ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/log", "//pkg/memutil", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/hostmm", - "//pkg/sentry/platform", + "//pkg/sentry/memmap", "//pkg/sentry/usage", "//pkg/state", + "//pkg/state/wire", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 2b11ea4ae..7c297fb9e 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -29,18 +29,19 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { // opts holds options passed to NewMemoryFile. opts is immutable. @@ -108,12 +109,6 @@ type MemoryFile struct { usageSwapped uint64 usageLast time.Time - // minUnallocatedPage is the minimum page that may be unallocated. - // i.e., there are no unallocated pages below minUnallocatedPage. - // - // minUnallocatedPage is protected by mu. - minUnallocatedPage uint64 - // fileSize is the size of the backing memory file in bytes. fileSize is // always a power-of-two multiple of chunkSize. // @@ -146,11 +141,9 @@ type MemoryFile struct { // is protected by mu. reclaimable bool - // minReclaimablePage is the minimum page that may be reclaimable. - // i.e., all reclaimable pages are >= minReclaimablePage. - // - // minReclaimablePage is protected by mu. - minReclaimablePage uint64 + // relcaim is the collection of regions for reclaim. relcaim is protected + // by mu. + reclaim reclaimSet // reclaimCond is signaled (with mu locked) when reclaimable or destroyed // transitions from false to true. @@ -232,6 +225,18 @@ type usageInfo struct { refs uint64 } +// canCommit returns true if the tracked region can be committed. +func (u *usageInfo) canCommit() bool { + // refs must be greater than 0 because we assume that reclaimable pages + // (that aren't already known to be committed) are not committed. This + // isn't necessarily true, even after the reclaimer does Decommit(), + // because the kernel may subsequently back the hugepage-sized region + // containing the decommitted page with a hugepage. However, it's + // consistent with our treatment of unallocated pages, which have the same + // property. + return !u.knownCommitted && u.refs != 0 +} + // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that // may be asked to deallocate that memory in the presence of memory pressure. type EvictableMemoryUser interface { @@ -273,12 +278,10 @@ type evictableMemoryUserInfo struct { } const ( - chunkShift = 24 - chunkSize = 1 << chunkShift // 16 MB + chunkShift = 30 + chunkSize = 1 << chunkShift // 1 GB chunkMask = chunkSize - 1 - initialSize = chunkSize - // maxPage is the highest 64-bit page. maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) ) @@ -302,19 +305,12 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { if err := file.Truncate(0); err != nil { return nil, err } - if err := file.Truncate(initialSize); err != nil { - return nil, err - } f := &MemoryFile{ - opts: opts, - fileSize: initialSize, - file: file, - // No pages are reclaimable. DecRef will always be able to - // decrease minReclaimablePage from this point. - minReclaimablePage: maxPage, - evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), + opts: opts, + file: file, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), } - f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.mappings.Store(make([]uintptr, 0)) f.reclaimCond.L = &f.mu if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure { @@ -389,7 +385,7 @@ func (f *MemoryFile) Destroy() { // to Allocate. // // Preconditions: length must be page-aligned and non-zero. -func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) { if length == 0 || length%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid allocation length: %#x", length)) } @@ -404,48 +400,38 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi alignment = usermem.HugePageSize } - start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) - end := start + length - // File offsets are int64s. Since length must be strictly positive, end - // cannot legitimately be 0. - if end < start || int64(end) <= 0 { - return platform.FileRange{}, syserror.ENOMEM + // Find a range in the underlying file. + fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) + if !ok { + return memmap.FileRange{}, syserror.ENOMEM } - // Expand the file if needed. Double the file size on each expansion; - // uncommitted pages have effectively no cost. - fileSize := f.fileSize - for int64(end) > fileSize { - if fileSize >= 2*fileSize { - // fileSize overflow. - return platform.FileRange{}, syserror.ENOMEM + // Expand the file if needed. + if int64(fr.End) > f.fileSize { + // Round the new file size up to be chunk-aligned. + newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask + if err := f.file.Truncate(newFileSize); err != nil { + return memmap.FileRange{}, err } - fileSize *= 2 - } - if fileSize > f.fileSize { - if err := f.file.Truncate(fileSize); err != nil { - return platform.FileRange{}, err - } - f.fileSize = fileSize + f.fileSize = newFileSize f.mappingsMu.Lock() oldMappings := f.mappings.Load().([]uintptr) - newMappings := make([]uintptr, fileSize>>chunkShift) + newMappings := make([]uintptr, newFileSize>>chunkShift) copy(newMappings, oldMappings) f.mappings.Store(newMappings) f.mappingsMu.Unlock() } - // Mark selected pages as in use. - fr := platform.FileRange{start, end} if f.opts.ManualZeroing { if err := f.forEachMappingSlice(fr, func(bs []byte) { for i := range bs { bs[i] = 0 } }); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } } + // Mark selected pages as in use. if !f.usage.Add(fr, usageInfo{ kind: kind, refs: 1, @@ -453,49 +439,79 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage)) } - if minUnallocatedPage < start { - f.minUnallocatedPage = minUnallocatedPage - } else { - // start was the first unallocated page. The next must be - // somewhere beyond end. - f.minUnallocatedPage = end - } - return fr, nil } -// findUnallocatedRange returns the first unallocated page in usage of the -// specified length and alignment beginning at page start and the first single -// unallocated page. -func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { - // Only searched until the first page is found. - firstPage := start - foundFirstPage := false - alignMask := alignment - 1 - for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { - r := seg.Range() - - if !foundFirstPage && r.Start > firstPage { - foundFirstPage = true +// findAvailableRange returns an available range in the usageSet. +// +// Note that scanning for available slots takes place from end first backwards, +// then forwards. This heuristic has important consequence for how sequential +// mappings can be merged in the host VMAs, given that addresses for both +// application and sentry mappings are allocated top-down (from higher to +// lower addresses). The file is also grown expoentially in order to create +// space for mappings to be allocated downwards. +// +// Precondition: alignment must be a power of 2. +func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { + alignmentMask := alignment - 1 + + // Search for space in existing gaps, starting at the current end of the + // file and working backward. + lastGap := usage.LastGap() + gap := lastGap + for { + end := gap.End() + if end > uint64(fileSize) { + end = uint64(fileSize) } - if start >= r.End { - // start was rounded up to an alignment boundary from the end - // of a previous segment and is now beyond r.End. - continue + // Try to allocate from the end of this gap, with the start of the + // allocated range aligned down to alignment. + unalignedStart := end - length + if unalignedStart > end { + // Negative overflow: this and all preceding gaps are too small to + // accommodate length. + break } - // This segment represents allocated or reclaimable pages; only the - // range from start to the segment's beginning is allocatable, and the - // next allocatable range begins after the segment. - if r.Start > start && r.Start-start >= length { + if start := unalignedStart &^ alignmentMask; start >= gap.Start() { + return memmap.FileRange{start, start + length}, true + } + + gap = gap.PrevLargeEnoughGap(length) + if !gap.Ok() { break } - start = (r.End + alignMask) &^ alignMask - if !foundFirstPage { - firstPage = r.End + } + + // Check that it's possible to fit this allocation at the end of a file of any size. + min := lastGap.Start() + min = (min + alignmentMask) &^ alignmentMask + if min+length < min { + // Overflow: allocation would exceed the range of uint64. + return memmap.FileRange{}, false + } + + // Determine the minimum file size required to fit this allocation at its end. + for { + newFileSize := 2 * fileSize + if newFileSize <= fileSize { + if fileSize != 0 { + // Overflow: allocation would exceed the range of int64. + return memmap.FileRange{}, false + } + newFileSize = chunkSize + } + fileSize = newFileSize + + unalignedStart := uint64(fileSize) - length + if unalignedStart > uint64(fileSize) { + // Negative overflow: fileSize is still inadequate. + continue + } + if start := unalignedStart &^ alignmentMask; start >= min { + return memmap.FileRange{start, start + length}, true } } - return start, firstPage } // AllocateAndFill allocates memory of the given kind and fills it by calling @@ -504,23 +520,25 @@ func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uin // nearest page. If this is shorter than length bytes due to an error returned // by r.ReadToBlocks(), it returns that error. // -// Preconditions: length > 0. length must be page-aligned. -func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { +// Preconditions: +// * length > 0. +// * length must be page-aligned. +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) { fr, err := f.Allocate(length, kind) if err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } dsts, err := f.MapInternal(fr, usermem.Write) if err != nil { f.DecRef(fr) - return platform.FileRange{}, err + return memmap.FileRange{}, err } n, err := safemem.ReadFullToBlocks(r, dsts) un := uint64(usermem.Addr(n).RoundDown()) if un < length { // Free unused memory and update fr to contain only the memory that is // still allocated. - f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) fr.End = fr.Start + un } return fr, err @@ -537,7 +555,7 @@ const ( // will read zeroes. // // Preconditions: fr.Length() > 0. -func (f *MemoryFile) Decommit(fr platform.FileRange) error { +func (f *MemoryFile) Decommit(fr memmap.FileRange) error { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -557,7 +575,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error { return nil } -func (f *MemoryFile) markDecommitted(fr platform.FileRange) { +func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() // Since we're changing the knownCommitted attribute, we need to merge @@ -578,8 +596,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// IncRef implements platform.File.IncRef. -func (f *MemoryFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (f *MemoryFile) IncRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -597,8 +615,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) } -// DecRef implements platform.File.DecRef. -func (f *MemoryFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (f *MemoryFile) DecRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -616,6 +634,7 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) { } val.refs-- if val.refs == 0 { + f.reclaim.Add(seg.Range(), reclaimSetValue{}) freed = true // Reclassify memory as System, until it's freed by the reclaim // goroutine. @@ -628,17 +647,13 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) if freed { - if fr.Start < f.minReclaimablePage { - // We've freed at least one lower page. - f.minReclaimablePage = fr.Start - } f.reclaimable = true f.reclaimCond.Signal() } } -// MapInternal implements platform.File.MapInternal. -func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { if !fr.WellFormed() || fr.Length() == 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -664,7 +679,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) ( // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. -func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { +func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { mappings := f.mappings.Load().([]uintptr) for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { chunk := int(chunkStart >> chunkShift) @@ -826,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error { log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") return nil } + // Linux updates usage values at CONFIG_HZ. + if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC { + log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter) + return nil + } f.usageLast = time.Now() err = f.updateUsageLocked(currentUsage, mincore) @@ -839,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error { // pages by invoking checkCommitted, which is a function that, for each page i // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise. // -// Precondition: f.mu must be held. +// Precondition: f.mu must be held; it may be unlocked and reacquired. func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error { // Track if anything changed to elide the merge. In the common case, we // expect all segments to be committed and no merge to occur. @@ -866,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } else if f.usageSwapped != 0 { // We have more usage accounted for than the file itself. // That's fine, we probably caught a race where pages were - // being committed while the above loop was running. Just + // being committed while the below loop was running. Just // report the higher number that we found and ignore swap. usage.MemoryAccounting.Dec(f.usageSwapped, usage.System) f.usageSwapped = 0 @@ -878,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // Iterate over all usage data. There will only be usage segments // present when there is an associated reference. - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - val := seg.Value() - - // Already known to be committed; ignore. - if val.knownCommitted { - continue - } - - // Assume that reclaimable pages (that aren't already known to be - // committed) are not committed. This isn't necessarily true, even - // after the reclaimer does Decommit(), because the kernel may - // subsequently back the hugepage-sized region containing the - // decommitted page with a hugepage. However, it's consistent with our - // treatment of unallocated pages, which have the same property. - if val.refs == 0 { + for seg := f.usage.FirstSegment(); seg.Ok(); { + if !seg.ValuePtr().canCommit() { + seg = seg.NextSegment() continue } @@ -915,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } // Query for new pages in core. - if err := checkCommitted(s, buf); err != nil { + // NOTE(b/165896008): mincore (which is passed as checkCommitted) + // by f.UpdateUsage() might take a really long time. So unlock f.mu + // while checkCommitted runs. + f.mu.Unlock() + err := checkCommitted(s, buf) + f.mu.Lock() + if err != nil { checkErr = err return } // Scan each page and switch out segments. - populatedRun := false - populatedRunStart := 0 - for i := 0; i <= bufLen; i++ { - // We run past the end of the slice here to - // simplify the logic and only set populated if - // we're still looking at elements. - populated := false - if i < bufLen { - populated = buf[i]&0x1 != 0 - } - - switch { - case populated == populatedRun: - // Keep the run going. - continue - case populated && !populatedRun: - // Begin the run. - populatedRun = true - populatedRunStart = i - // Keep going. + seg := f.usage.LowerBoundSegment(r.Start) + for i := 0; i < bufLen; { + if buf[i]&0x1 == 0 { + i++ continue - case !populated && populatedRun: - // Finish the run by changing this segment. - runRange := platform.FileRange{ - Start: r.Start + uint64(populatedRunStart*usermem.PageSize), - End: r.Start + uint64(i*usermem.PageSize), + } + // Scan to the end of this committed range. + j := i + 1 + for ; j < bufLen; j++ { + if buf[j]&0x1 == 0 { + break } - seg = f.usage.Isolate(seg, runRange) - seg.ValuePtr().knownCommitted = true - // Advance the segment only if we still - // have work to do in the context of - // the original segment from the for - // loop. Otherwise, the for loop itself - // will advance the segment - // appropriately. - if runRange.End != r.End { - seg = seg.NextSegment() + } + committedFR := memmap.FileRange{ + Start: r.Start + uint64(i*usermem.PageSize), + End: r.Start + uint64(j*usermem.PageSize), + } + // Advance seg to committedFR.Start. + for seg.Ok() && seg.End() < committedFR.Start { + seg = seg.NextSegment() + } + // Mark pages overlapping committedFR as committed. + for seg.Ok() && seg.Start() < committedFR.End { + if seg.ValuePtr().canCommit() { + seg = f.usage.Isolate(seg, committedFR) + seg.ValuePtr().knownCommitted = true + amount := seg.Range().Length() + usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind) + f.usageExpected += amount + changedAny = true } - amount := runRange.Length() - usage.MemoryAccounting.Inc(amount, val.kind) - f.usageExpected += amount - changedAny = true - populatedRun = false + seg = seg.NextSegment() } + // Continue scanning for committed pages. + i = j + 1 } // Advance r.Start. @@ -976,6 +981,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( if err != nil { return err } + + // Continue with the first segment after r.End. + seg = f.usage.LowerBoundSegment(r.End) } return nil @@ -1009,7 +1017,7 @@ func (f *MemoryFile) File() *os.File { return f.file } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } @@ -1030,6 +1038,7 @@ func (f *MemoryFile) String() string { // for allocation. func (f *MemoryFile) runReclaim() { for { + // N.B. We must call f.markReclaimed on the returned FrameRange. fr, ok := f.findReclaimable() if !ok { break @@ -1085,13 +1094,17 @@ func (f *MemoryFile) runReclaim() { } } -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { +// findReclaimable finds memory that has been marked for reclaim. +// +// Note that there returned range will be removed from tracking. It +// must be reclaimed (removed from f.usage) at this point. +func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { f.mu.Lock() defer f.mu.Unlock() for { for { if f.destroyed { - return platform.FileRange{}, false + return memmap.FileRange{}, false } if f.reclaimable { break @@ -1103,27 +1116,24 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { } f.reclaimCond.Wait() } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } + // Allocate works from the back of the file inwards, so reclaim + // preserves this order to minimize the cost of the search. + if seg := f.reclaim.LastSegment(); seg.Ok() { + fr := seg.Range() + f.reclaim.Remove(seg) + return fr, true } - // No pages are reclaimable. + // Nothing is reclaimable. f.reclaimable = false - f.minReclaimablePage = maxPage } } -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { +func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. + // All of fr should be mapped to a single uncommitted reclaimable + // segment accounted to System. if !seg.Ok() { panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) } @@ -1137,14 +1147,10 @@ func (f *MemoryFile) markReclaimed(fr platform.FileRange) { }); got != want { panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. + // Deallocate reclaimed pages. Even though all of seg is reclaimable, + // the caller of markReclaimed may not have decommitted it, so we can + // only mark fr as reclaimed. f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } } // StartEvictions requests that f evict all evictable allocations. It does not @@ -1169,8 +1175,10 @@ func (f *MemoryFile) startEvictionsLocked() bool { return startedAny } -// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be -// locked. +// Preconditions: +// * info == f.evictable[user]. +// * !info.evicting. +// * f.mu must be locked. func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { info.evicting = true f.evictionWG.Add(1) @@ -1224,11 +1232,11 @@ func (usageSetFunctions) MaxKey() uint64 { func (usageSetFunctions) ClearValue(val *usageInfo) { } -func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) { +func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { return val1, val1 == val2 } -func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { +func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } @@ -1255,3 +1263,27 @@ func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetVal func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { return evictableRangeSetValue{}, evictableRangeSetValue{} } + +// reclaimSetValue is the value type of reclaimSet. +type reclaimSetValue struct{} + +type reclaimSetFunctions struct{} + +func (reclaimSetFunctions) MinKey() uint64 { + return 0 +} + +func (reclaimSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { +} + +func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { + return reclaimSetValue{}, true +} + +func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { + return reclaimSetValue{}, reclaimSetValue{} +} diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go index 293f22c6b..405db141f 100644 --- a/pkg/sentry/pgalloc/pgalloc_test.go +++ b/pkg/sentry/pgalloc/pgalloc_test.go @@ -23,39 +23,49 @@ import ( const ( page = usermem.PageSize hugepage = usermem.HugePageSize + topPage = (1 << 63) - page ) func TestFindUnallocatedRange(t *testing.T) { for _, test := range []struct { - desc string - usage *usageSegmentDataSlices - start uint64 - length uint64 - alignment uint64 - unallocated uint64 - minUnallocated uint64 + desc string + usage *usageSegmentDataSlices + fileSize int64 + length uint64 + alignment uint64 + start uint64 + expectFail bool }{ { - desc: "Initial allocation succeeds", - usage: &usageSegmentDataSlices{}, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, + desc: "Initial allocation succeeds", + usage: &usageSegmentDataSlices{}, + length: page, + alignment: page, + start: chunkSize - page, // Grows by chunkSize, allocate down. }, { - desc: "Allocation begins at start of file", + desc: "Allocation finds empty space at start of file", usage: &usageSegmentDataSlices{ Start: []uint64{page}, End: []uint64{2 * page}, Values: []usageInfo{{refs: 1}}, }, - start: 0, - length: page, - alignment: page, - unallocated: 0, - minUnallocated: 0, + fileSize: 2 * page, + length: page, + alignment: page, + start: 0, + }, + { + desc: "Allocation finds empty space at end of file", + usage: &usageSegmentDataSlices{ + Start: []uint64{0}, + End: []uint64{page}, + Values: []usageInfo{{refs: 1}}, + }, + fileSize: 2 * page, + length: page, + alignment: page, + start: page, }, { desc: "In-use frames are not allocatable", @@ -64,11 +74,10 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 2 * page}, Values: []usageInfo{{refs: 1}, {refs: 2}}, }, - start: 0, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, + fileSize: 2 * page, + length: page, + alignment: page, + start: 3 * page, // Double fileSize, allocate top-down. }, { desc: "Reclaimable frames are not allocatable", @@ -77,11 +86,10 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 2 * page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 0}, {refs: 1}}, }, - start: 0, - length: page, - alignment: page, - unallocated: 3 * page, - minUnallocated: 3 * page, + fileSize: 3 * page, + length: page, + alignment: page, + start: 5 * page, // Double fileSize, grow down. }, { desc: "Gaps between in-use frames are allocatable", @@ -90,11 +98,10 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - start: 0, - length: page, - alignment: page, - unallocated: page, - minUnallocated: page, + fileSize: 3 * page, + length: page, + alignment: page, + start: page, }, { desc: "Inadequately-sized gaps are rejected", @@ -103,14 +110,13 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, 3 * page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - start: 0, - length: 2 * page, - alignment: page, - unallocated: 3 * page, - minUnallocated: page, + fileSize: 3 * page, + length: 2 * page, + alignment: page, + start: 4 * page, // Double fileSize, grow down. }, { - desc: "Hugepage alignment is honored", + desc: "Alignment is honored at end of file", usage: &usageSegmentDataSlices{ Start: []uint64{0, hugepage + page}, // Hugepage-sized gap here that shouldn't be allocated from @@ -118,37 +124,103 @@ func TestFindUnallocatedRange(t *testing.T) { End: []uint64{page, hugepage + 2*page}, Values: []usageInfo{{refs: 1}, {refs: 1}}, }, - start: 0, - length: hugepage, - alignment: hugepage, - unallocated: 2 * hugepage, - minUnallocated: page, + fileSize: hugepage + 2*page, + length: hugepage, + alignment: hugepage, + start: 3 * hugepage, // Double fileSize until alignment is satisfied, grow down. }, { - desc: "Pages before start ignored", + desc: "Alignment is honored before end of file", + usage: &usageSegmentDataSlices{ + Start: []uint64{0, 2*hugepage + page}, + // Page will need to be shifted down from top. + End: []uint64{page, 2*hugepage + 2*page}, + Values: []usageInfo{{refs: 1}, {refs: 1}}, + }, + fileSize: 2*hugepage + 2*page, + length: hugepage, + alignment: hugepage, + start: hugepage, + }, + { + desc: "Allocation doubles file size more than once if necessary", + usage: &usageSegmentDataSlices{}, + fileSize: page, + length: 4 * page, + alignment: page, + start: 0, + }, + { + desc: "Allocations are compact if possible", usage: &usageSegmentDataSlices{ Start: []uint64{page, 3 * page}, End: []uint64{2 * page, 4 * page}, Values: []usageInfo{{refs: 1}, {refs: 2}}, }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, + fileSize: 4 * page, + length: page, + alignment: page, + start: 2 * page, + }, + { + desc: "Top-down allocation within one gap", + usage: &usageSegmentDataSlices{ + Start: []uint64{page, 4 * page, 7 * page}, + End: []uint64{2 * page, 5 * page, 8 * page}, + Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}}, + }, + fileSize: 8 * page, + length: page, + alignment: page, + start: 6 * page, + }, + { + desc: "Top-down allocation between multiple gaps", + usage: &usageSegmentDataSlices{ + Start: []uint64{page, 3 * page, 5 * page}, + End: []uint64{2 * page, 4 * page, 6 * page}, + Values: []usageInfo{{refs: 1}, {refs: 2}, {refs: 1}}, + }, + fileSize: 6 * page, + length: page, + alignment: page, + start: 4 * page, }, { - desc: "start may be in the middle of segment", + desc: "Top-down allocation with large top gap", usage: &usageSegmentDataSlices{ - Start: []uint64{0, 3 * page}, + Start: []uint64{page, 3 * page}, End: []uint64{2 * page, 4 * page}, Values: []usageInfo{{refs: 1}, {refs: 2}}, }, - start: page, - length: page, - alignment: page, - unallocated: 2 * page, - minUnallocated: 2 * page, + fileSize: 8 * page, + length: page, + alignment: page, + start: 7 * page, + }, + { + desc: "Gaps found with possible overflow", + usage: &usageSegmentDataSlices{ + Start: []uint64{page, topPage - page}, + End: []uint64{2 * page, topPage}, + Values: []usageInfo{{refs: 1}, {refs: 1}}, + }, + fileSize: topPage, + length: page, + alignment: page, + start: topPage - 2*page, + }, + { + desc: "Overflow detected", + usage: &usageSegmentDataSlices{ + Start: []uint64{page}, + End: []uint64{topPage}, + Values: []usageInfo{{refs: 1}}, + }, + fileSize: topPage, + length: 2 * page, + alignment: page, + expectFail: true, }, } { t.Run(test.desc, func(t *testing.T) { @@ -156,12 +228,18 @@ func TestFindUnallocatedRange(t *testing.T) { if err := usage.ImportSortedSlices(test.usage); err != nil { t.Fatalf("Failed to initialize usage from %v: %v", test.usage, err) } - unallocated, minUnallocated := findUnallocatedRange(&usage, test.start, test.length, test.alignment) - if unallocated != test.unallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got unallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, unallocated, test.unallocated) + fr, ok := findAvailableRange(&usage, test.fileSize, test.length, test.alignment) + if !test.expectFail && !ok { + t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, false wanted %x, true", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start) + } + if test.expectFail && ok { + t.Fatalf("findAvailableRange(%v, %x, %x, %x): got %x, true wanted %x, false", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start) + } + if ok && fr.Start != test.start { + t.Errorf("findAvailableRange(%v, %x, %x, %x): got start=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.Start, test.start) } - if minUnallocated != test.minUnallocated { - t.Errorf("findUnallocatedRange(%v, %x, %x, %x): got minUnallocated %x, wanted %x", test.usage, test.start, test.length, test.alignment, minUnallocated, test.minUnallocated) + if ok && fr.End != test.start+test.length { + t.Errorf("findAvailableRange(%v, %x, %x, %x): got end=%x, wanted %x", test.usage, test.fileSize, test.length, test.alignment, fr.End, test.start+test.length) } }) } diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index f8385c146..78317fa35 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -26,11 +26,12 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/state/wire" "gvisor.dev/gvisor/pkg/usermem" ) // SaveTo writes f's state to the given stream. -func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { +func (f *MemoryFile) SaveTo(ctx context.Context, w wire.Writer) error { // Wait for reclaim. f.mu.Lock() defer f.mu.Unlock() @@ -79,10 +80,10 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { } // Save metadata. - if err := state.Save(ctx, w, &f.fileSize, nil); err != nil { + if _, err := state.Save(ctx, w, &f.fileSize); err != nil { return err } - if err := state.Save(ctx, w, &f.usage, nil); err != nil { + if _, err := state.Save(ctx, w, &f.usage); err != nil { return err } @@ -115,9 +116,9 @@ func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer) error { } // LoadFrom loads MemoryFile state from the given stream. -func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { +func (f *MemoryFile) LoadFrom(ctx context.Context, r wire.Reader) error { // Load metadata. - if err := state.Load(ctx, r, &f.fileSize, nil); err != nil { + if _, err := state.Load(ctx, r, &f.fileSize); err != nil { return err } if err := f.file.Truncate(f.fileSize); err != nil { @@ -125,7 +126,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader) error { } newMappings := make([]uintptr, f.fileSize>>chunkShift) f.mappings.Store(newMappings) - if err := state.Load(ctx, r, &f.usage, nil); err != nil { + if _, err := state.Load(ctx, r, &f.usage); err != nil { return err } diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 453241eca..db7d55ef2 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,39 +1,22 @@ load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "file_range", - out = "file_range.go", - package = "platform", - prefix = "File", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - go_library( name = "platform", srcs = [ "context.go", - "file_range.go", "mmap_min_addr.go", "platform.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/atomicbitops", "//pkg/context", - "//pkg/log", - "//pkg/safecopy", - "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/usage", - "//pkg/syserror", + "//pkg/sentry/hostmm", + "//pkg/sentry/memmap", "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index 57be41647..9dfac3eae 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -54,8 +54,9 @@ type Forwarder struct { // } // defer f.Disable() // -// Preconditions: r must not be nil. f must not already be forwarding -// interrupts to a Receiver. +// Preconditions: +// * r must not be nil. +// * f must not already be forwarding interrupts to a Receiver. func (f *Forwarder) Enable(r Receiver) bool { if r == nil { panic("nil Receiver") diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 159f7eafd..8ce411102 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -6,15 +6,15 @@ go_library( name = "kvm", srcs = [ "address_space.go", - "allocator.go", "bluepill.go", + "bluepill_allocator.go", "bluepill_amd64.go", - "bluepill_amd64.s", "bluepill_amd64_unsafe.go", "bluepill_arm64.go", "bluepill_arm64.s", "bluepill_arm64_unsafe.go", "bluepill_fault.go", + "bluepill_impl_amd64.s", "bluepill_unsafe.go", "context.go", "filters_amd64.go", @@ -41,12 +41,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/cpuid", "//pkg/log", "//pkg/procid", "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", @@ -54,12 +56,15 @@ go_library( "//pkg/sentry/time", "//pkg/sync", "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) go_test( name = "kvm_test", srcs = [ + "kvm_amd64_test.go", + "kvm_arm64_test.go", "kvm_test.go", "virtual_map_test.go", ], @@ -75,6 +80,15 @@ go_test( "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/time", "//pkg/usermem", ], ) + +genrule( + name = "bluepill_impl_amd64", + srcs = ["bluepill_amd64.s"], + outs = ["bluepill_impl_amd64.s"], + cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index be213bfe8..af5c5e191 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" @@ -26,16 +27,15 @@ import ( // dirtySet tracks vCPUs for invalidation. type dirtySet struct { - vCPUs []uint64 + vCPUMasks []uint64 } // forEach iterates over all CPUs in the dirty set. +// +//go:nosplit func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { - m.mu.RLock() - defer m.mu.RUnlock() - - for index := range ds.vCPUs { - mask := atomic.SwapUint64(&ds.vCPUs[index], 0) + for index := range ds.vCPUMasks { + mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0) if mask != 0 { for bit := 0; bit < 64; bit++ { if mask&(1<<uint64(bit)) == 0 { @@ -54,7 +54,7 @@ func (ds *dirtySet) mark(c *vCPU) bool { index := uint64(c.id) / 64 bit := uint64(1) << uint(c.id%64) - oldValue := atomic.LoadUint64(&ds.vCPUs[index]) + oldValue := atomic.LoadUint64(&ds.vCPUMasks[index]) if oldValue&bit != 0 { return false // Not clean. } @@ -62,7 +62,7 @@ func (ds *dirtySet) mark(c *vCPU) bool { // Set the bit unilaterally, and ensure that a flush takes place. Note // that it's possible for races to occur here, but since the flush is // taking place long after these lines there's no race in practice. - atomicbitops.OrUint64(&ds.vCPUs[index], bit) + atomicbitops.OrUint64(&ds.vCPUMasks[index], bit) return true // Previously clean. } @@ -113,7 +113,12 @@ type hostMapEntry struct { length uintptr } -func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { +// mapLocked maps the given host entry. +// +// +checkescape:hard,stack +// +//go:nosplit +func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { for m.length > 0 { physical, length, ok := translateToPhysical(m.addr) if !ok { @@ -133,18 +138,10 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac // important; if the pagetable mappings were installed before // ensuring the physical pages were available, then some other // thread could theoretically access them. - // - // Due to the way KVM's shadow paging implementation works, - // modifications to the page tables while in host mode may not - // be trapped, leading to the shadow pages being out of sync. - // Therefore, we need to ensure that we are in guest mode for - // page table modifications. See the call to bluepill, below. - as.machine.retryInGuest(func() { - inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ - AccessType: at, - User: true, - }, physical) || inv - }) + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ + AccessType: at, + User: true, + }, physical) || inv m.addr += length m.length -= length addr += usermem.Addr(length) @@ -154,7 +151,7 @@ func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.Ac } // MapFile implements platform.AddressSpace.MapFile. -func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { as.mu.Lock() defer as.mu.Unlock() @@ -176,6 +173,10 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform. return err } + // See block in mapLocked. + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) + // Map the mappings in the sentry's address space (guest physical memory) // into the application's address space (guest virtual memory). inv := false @@ -190,7 +191,12 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform. _ = s[i] // Touch to commit. } } - prev := as.mapHost(addr, hostMapEntry{ + + // See bluepill_allocator.go. + bluepill(as.pageTables.Allocator.(*allocator).cpu) + + // Perform the mapping. + prev := as.mapLocked(addr, hostMapEntry{ addr: b.Addr(), length: uintptr(b.Len()), }, at) @@ -204,17 +210,27 @@ func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform. return nil } +// unmapLocked is an escape-checked wrapped around Unmap. +// +// +checkescape:hard,stack +// +//go:nosplit +func (as *addressSpace) unmapLocked(addr usermem.Addr, length uint64) bool { + return as.pageTables.Unmap(addr, uintptr(length)) +} + // Unmap unmaps the given range by calling pagetables.PageTables.Unmap. func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { as.mu.Lock() defer as.mu.Unlock() - // See above re: retryInGuest. - var prev bool - as.machine.retryInGuest(func() { - prev = as.pageTables.Unmap(addr, uintptr(length)) || prev - }) - if prev { + // See above & bluepill_allocator.go. + as.pageTables.Allocator.(*allocator).cpu = as.machine.Get() + defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu) + bluepill(as.pageTables.Allocator.(*allocator).cpu) + + if prev := as.unmapLocked(addr, length); prev { + // Invalidate all active vCPUs. as.invalidate() // Recycle any freed intermediate pages. @@ -227,8 +243,14 @@ func (as *addressSpace) Release() { as.Unmap(0, ^uint64(0)) // Free all pages from the allocator. - as.pageTables.Allocator.(allocator).base.Drain() + as.pageTables.Allocator.(*allocator).base.Drain() // Drop all cached machine references. as.machine.dropPageTables(as.pageTables) } + +// PreFork implements platform.AddressSpace.PreFork. +func (as *addressSpace) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. +func (as *addressSpace) PostFork() {} diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/bluepill_allocator.go index 3f35414bb..9485e1301 100644 --- a/pkg/sentry/platform/kvm/allocator.go +++ b/pkg/sentry/platform/kvm/bluepill_allocator.go @@ -21,56 +21,80 @@ import ( ) type allocator struct { - base *pagetables.RuntimeAllocator + base pagetables.RuntimeAllocator + + // cpu must be set prior to any pagetable operation. + // + // Due to the way KVM's shadow paging implementation works, + // modifications to the page tables while in host mode may not be + // trapped, leading to the shadow pages being out of sync. Therefore, + // we need to ensure that we are in guest mode for page table + // modifications. See the call to bluepill, below. + cpu *vCPU } // newAllocator is used to define the allocator. -func newAllocator() allocator { - return allocator{ - base: pagetables.NewRuntimeAllocator(), - } +func newAllocator() *allocator { + a := new(allocator) + a.base.Init() + return a } // NewPTEs implements pagetables.Allocator.NewPTEs. // +// +checkescape:all +// //go:nosplit -func (a allocator) NewPTEs() *pagetables.PTEs { - return a.base.NewPTEs() +func (a *allocator) NewPTEs() *pagetables.PTEs { + ptes := a.base.NewPTEs() // escapes: bluepill below. + if a.cpu != nil { + bluepill(a.cpu) + } + return ptes } // PhysicalFor returns the physical address for a set of PTEs. // +// +checkescape:all +// //go:nosplit -func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { +func (a *allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { virtual := a.base.PhysicalFor(ptes) physical, _, ok := translateToPhysical(virtual) if !ok { - panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) + panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) // escapes: panic. } return physical } // LookupPTEs implements pagetables.Allocator.LookupPTEs. // +// +checkescape:all +// //go:nosplit -func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { +func (a *allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { virtualStart, physicalStart, _, ok := calculateBluepillFault(physical, physicalRegions) if !ok { - panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) + panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) // escapes: panic. } return a.base.LookupPTEs(virtualStart + (physical - physicalStart)) } // FreePTEs implements pagetables.Allocator.FreePTEs. // +// +checkescape:all +// //go:nosplit -func (a allocator) FreePTEs(ptes *pagetables.PTEs) { - a.base.FreePTEs(ptes) +func (a *allocator) FreePTEs(ptes *pagetables.PTEs) { + a.base.FreePTEs(ptes) // escapes: bluepill below. + if a.cpu != nil { + bluepill(a.cpu) + } } // Recycle implements pagetables.Allocator.Recycle. // //go:nosplit -func (a allocator) Recycle() { +func (a *allocator) Recycle() { a.base.Recycle() } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index 133c2203d..ddc1554d5 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -63,6 +63,8 @@ func bluepillArchEnter(context *arch.SignalContext64) *vCPU { // KernelSyscall handles kernel syscalls. // +// +checkescape:all +// //go:nosplit func (c *vCPU) KernelSyscall() { regs := c.Registers() @@ -72,13 +74,15 @@ func (c *vCPU) KernelSyscall() { // We only trigger a bluepill entry in the bluepill function, and can // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. - ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no. ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. + ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. } // KernelException handles kernel exceptions. // +// +checkescape:all +// //go:nosplit func (c *vCPU) KernelException(vector ring0.Vector) { regs := c.Registers() @@ -89,9 +93,9 @@ func (c *vCPU) KernelException(vector ring0.Vector) { regs.Rip = 0 } // See above. - ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no. ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. + ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. } // bluepillArchExit is called during bluepillEnter. diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 2bc34a435..025ea93b5 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -19,11 +19,6 @@ // This is guaranteed to be zero. #define VCPU_CPU 0x0 -// CPU_SELF is the self reference in ring0's percpu. -// -// This is guaranteed to be zero. -#define CPU_SELF 0x0 - // Context offsets. // // Only limited use of the context is done in the assembly stub below, most is @@ -44,7 +39,7 @@ begin: LEAQ VCPU_CPU(AX), BX BYTE CLI; check_vcpu: - MOVQ CPU_SELF(GS), CX + MOVQ ENTRY_CPU_SELF(GS), CX CMPQ BX, CX JE right_vCPU wrong_vcpu: diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go index 99cac665d..0a54dd30d 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -17,6 +17,7 @@ package kvm import ( + "syscall" "unsafe" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -53,3 +54,63 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { context.Rbx = uint64(uintptr(unsafe.Pointer(c))) context.Rip = uint64(dieTrampolineAddr) } + +// getHypercallID returns hypercall ID. +// +//go:nosplit +func getHypercallID(addr uintptr) int { + return _KVM_HYPERCALL_MAX +} + +// bluepillStopGuest is reponsible for injecting interrupt. +// +//go:nosplit +func bluepillStopGuest(c *vCPU) { + // Interrupt: we must have requested an interrupt + // window; set the interrupt line. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_INTERRUPT, + uintptr(unsafe.Pointer(&bounce))); errno != 0 { + throw("interrupt injection failed") + } + // Clear previous injection request. + c.runData.requestInterruptWindow = 0 +} + +// bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection. +// +//go:nosplit +func bluepillReadyStopGuest(c *vCPU) bool { + if c.runData.readyForInterruptInjection == 0 { + return false + } + + if c.runData.ifFlag == 0 { + // This is impossible if readyForInterruptInjection is 1. + throw("interrupts are disabled") + } + + // Disable interrupts if we are in the kernel space. + // + // When the Sentry switches into the kernel mode, it disables + // interrupts. But when goruntime switches on a goroutine which has + // been saved in the host mode, it restores flags and this enables + // interrupts. See the comment of UserFlagsSet for more details. + uregs := userRegs{} + err := c.getUserRegisters(&uregs) + if err != 0 { + throw("failed to get user registers") + } + + if ring0.IsKernelFlags(uregs.RFLAGS) { + uregs.RFLAGS &^= ring0.KernelFlagsClear + err = c.setUserRegisters(&uregs) + if err != 0 { + throw("failed to set user registers") + } + return false + } + return true +} diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index c215d443c..58f3d6fdd 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -26,8 +26,29 @@ import ( var ( // The action for bluepillSignal is changed by sigaction(). bluepillSignal = syscall.SIGILL + + // vcpuSErr is the event of system error. + vcpuSErr = kvmVcpuEvents{ + exception: exception{ + sErrPending: 1, + sErrHasEsr: 0, + pad: [6]uint8{0, 0, 0, 0, 0, 0}, + sErrEsr: 1, + }, + rsvd: [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + } ) +// getTLS returns the value of TPIDR_EL0 register. +// +//go:nosplit +func getTLS() (value uint64) + +// setTLS writes the TPIDR_EL0 value. +// +//go:nosplit +func setTLS(value uint64) + // bluepillArchEnter is called during bluepillEnter. // //go:nosplit @@ -38,8 +59,10 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { regs.Sp = context.Sp regs.Pc = context.Pc regs.Pstate = context.Pstate - regs.Pstate &^= uint64(ring0.KernelFlagsClear) + regs.Pstate &^= uint64(ring0.PsrFlagsClear) regs.Pstate |= ring0.KernelFlagsSet + regs.TPIDR_EL0 = getTLS() + return } @@ -52,8 +75,9 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { context.Sp = regs.Sp context.Pc = regs.Pc context.Pstate = regs.Pstate - context.Pstate &^= uint64(ring0.UserFlagsClear) + context.Pstate &^= uint64(ring0.PsrFlagsClear) context.Pstate |= ring0.UserFlagsSet + setTLS(regs.TPIDR_EL0) lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { @@ -66,6 +90,8 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { // KernelSyscall handles kernel syscalls. // +// +checkescape:all +// //go:nosplit func (c *vCPU) KernelSyscall() { regs := c.Registers() @@ -88,6 +114,8 @@ func (c *vCPU) KernelSyscall() { // KernelException handles kernel exceptions. // +// +checkescape:all +// //go:nosplit func (c *vCPU) KernelException(vector ring0.Vector) { regs := c.Registers() diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s index 04efa0147..09c7e88e5 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.s +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -32,6 +32,18 @@ #define CONTEXT_PC 0x1B8 #define CONTEXT_R0 0xB8 +// getTLS returns the value of TPIDR_EL0 register. +TEXT ·getTLS(SB),NOSPLIT,$0-8 + MRS TPIDR_EL0, R1 + MOVD R1, ret+0(FP) + RET + +// setTLS writes the TPIDR_EL0 value. +TEXT ·setTLS(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + MSR R1, TPIDR_EL0 + RET + // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT,$0 begin: diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index 4ca2b7717..b35c930e2 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -17,6 +17,7 @@ package kvm import ( + "syscall" "unsafe" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -40,7 +41,7 @@ func fpsimdPtr(addr *byte) *arch.FpsimdContext { func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { // If the vCPU is in user mode, we set the stack to the stored stack // value in the vCPU itself. We don't want to unwind the user stack. - if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t { + if guestRegs.Regs.Pstate&ring0.PsrModeMask == ring0.UserFlagsSet { regs := c.CPU.Registers() context.Regs[0] = regs.Regs[0] context.Sp = regs.Sp @@ -61,3 +62,36 @@ func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { func bluepillArchFpContext(context unsafe.Pointer) *arch.FpsimdContext { return &((*arch.SignalContext64)(context).Fpsimd64) } + +// getHypercallID returns hypercall ID. +// +// On Arm64, the MMIO address should be 64-bit aligned. +// +//go:nosplit +func getHypercallID(addr uintptr) int { + if addr < arm64HypercallMMIOBase || addr >= (arm64HypercallMMIOBase+_AARCH64_HYPERCALL_MMIO_SIZE) { + return _KVM_HYPERCALL_MAX + } else { + return int(((addr) - arm64HypercallMMIOBase) >> 3) + } +} + +// bluepillStopGuest is reponsible for injecting sError. +// +//go:nosplit +func bluepillStopGuest(c *vCPU) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_VCPU_EVENTS, + uintptr(unsafe.Pointer(&vcpuSErr))); errno != 0 { + throw("sErr injection failed") + } +} + +// bluepillReadyStopGuest checks whether the current vCPU is ready for sError injection. +// +//go:nosplit +func bluepillReadyStopGuest(c *vCPU) bool { + return true +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index e34f46aeb..a182e4f22 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -98,6 +98,10 @@ func handleBluepillFault(m *machine, physical uintptr, phyRegions []physicalRegi } errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart, flags) if errno == 0 { + // Store the physical address in the slot. This is used to + // avoid calls to handleBluepillFault in the future (see + // machine.mapPhysical). + atomic.StoreUintptr(&m.usedSlots[slot], physical) // Successfully added region; we can increment nextSlot and // allow another set to proceed here. atomic.StoreUint32(&m.nextSlot, slot+1) diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 2407014e9..eb05950cd 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -58,20 +58,40 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } +// bluepillHandleHlt is reponsible for handling VM-Exit. +// +//go:nosplit +func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { + // Increment our counter. + atomic.AddUint64(&c.guestExits, 1) + + // Copy out registers. + bluepillArchExit(c, bluepillArchContext(context)) + + // Return to the vCPUReady state; notify any waiters. + user := atomic.LoadUint32(&c.state) & vCPUUser + switch atomic.SwapUint32(&c.state, user) { + case user | vCPUGuest: // Expected case. + case user | vCPUGuest | vCPUWaiter: + c.notify() + default: + throw("invalid state") + } +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the // signal stack. It should only execute raw system calls and functions that are // explicitly marked go:nosplit. // +// +checkescape:all +// //go:nosplit func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) - // Increment the number of switches. - atomic.AddUint32(&c.switches, 1) - // Mark this as guest mode. switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) { case vCPUUser: // Expected case. @@ -82,7 +102,8 @@ func bluepillHandler(context unsafe.Pointer) { } for { - switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno { + _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. + switch errno { case 0: // Expected case. case syscall.EINTR: // First, we process whatever pending signal @@ -90,7 +111,7 @@ func bluepillHandler(context unsafe.Pointer) { // currently, all signals are masked and the signal // must have been delivered directly to this thread. timeout := syscall.Timespec{} - sig, _, errno := syscall.RawSyscall6( + sig, _, errno := syscall.RawSyscall6( // escapes: no. syscall.SYS_RT_SIGTIMEDWAIT, uintptr(unsafe.Pointer(&bounceSignalMask)), 0, // siginfo. @@ -112,12 +133,12 @@ func bluepillHandler(context unsafe.Pointer) { // PIC, we can't inject an interrupt while they are // masked. We need to request a window if it's not // ready. - if c.runData.readyForInterruptInjection == 0 { - c.runData.requestInterruptWindow = 1 - continue // Rerun vCPU. - } else { + if bluepillReadyStopGuest(c) { // Force injection below; the vCPU is ready. c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN + } else { + c.runData.requestInterruptWindow = 1 + continue // Rerun vCPU. } case syscall.EFAULT: // If a fault is not serviceable due to the host @@ -125,7 +146,7 @@ func bluepillHandler(context unsafe.Pointer) { // MMIO exit we receive EFAULT from the run ioctl. We // always inject an NMI here since we may be in kernel // mode and have interrupts disabled. - if _, _, errno := syscall.RawSyscall( + if _, _, errno := syscall.RawSyscall( // escapes: no. syscall.SYS_IOCTL, uintptr(c.fd), _KVM_NMI, 0); errno != 0 { @@ -156,25 +177,20 @@ func bluepillHandler(context unsafe.Pointer) { c.die(bluepillArchContext(context), "debug") return case _KVM_EXIT_HLT: - // Copy out registers. - bluepillArchExit(c, bluepillArchContext(context)) - - // Return to the vCPUReady state; notify any waiters. - user := atomic.LoadUint32(&c.state) & vCPUUser - switch atomic.SwapUint32(&c.state, user) { - case user | vCPUGuest: // Expected case. - case user | vCPUGuest | vCPUWaiter: - c.notify() - default: - throw("invalid state") - } + bluepillGuestExit(c, context) return case _KVM_EXIT_MMIO: + physical := uintptr(c.runData.data[0]) + if getHypercallID(physical) == _KVM_HYPERCALL_VMEXIT { + bluepillGuestExit(c, context) + return + } + // Increment the fault count. atomic.AddUint32(&c.faults, 1) // For MMIO, the physical address is the first data item. - physical := uintptr(c.runData.data[0]) + physical = uintptr(c.runData.data[0]) virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE) if !ok { c.die(bluepillArchContext(context), "invalid physical address") @@ -201,17 +217,7 @@ func bluepillHandler(context unsafe.Pointer) { } } case _KVM_EXIT_IRQ_WINDOW_OPEN: - // Interrupt: we must have requested an interrupt - // window; set the interrupt line. - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_INTERRUPT, - uintptr(unsafe.Pointer(&bounce))); errno != 0 { - throw("interrupt injection failed") - } - // Clear previous injection request. - c.runData.requestInterruptWindow = 0 + bluepillStopGuest(c) case _KVM_EXIT_SHUTDOWN: c.die(bluepillArchContext(context), "shutdown") return diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 6507121ea..17268d127 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -15,6 +15,9 @@ package kvm import ( + "sync/atomic" + + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -37,7 +40,8 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() localAS := as.(*addressSpace) // Grab a vCPU. @@ -73,6 +77,9 @@ func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*a // Clear the address space. cpu.active.set(nil) + // Increment the number of user exits. + atomic.AddUint64(&cpu.userExits, 1) + // Release resources. c.machine.Put(cpu) @@ -88,3 +95,9 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} + +// FullStateChanged implements platform.Context.FullStateChanged. +func (c *context) FullStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} diff --git a/pkg/sentry/platform/kvm/filters_amd64.go b/pkg/sentry/platform/kvm/filters_amd64.go index 7d949f1dd..d3d216aa5 100644 --- a/pkg/sentry/platform/kvm/filters_amd64.go +++ b/pkg/sentry/platform/kvm/filters_amd64.go @@ -17,14 +17,23 @@ package kvm import ( "syscall" + "golang.org/x/sys/unix" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // SyscallFilters returns syscalls made exclusively by the KVM platform. func (*KVM) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_ARCH_PRCTL: {}, - syscall.SYS_IOCTL: {}, + syscall.SYS_ARCH_PRCTL: {}, + syscall.SYS_IOCTL: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, syscall.SYS_RT_SIGTIMEDWAIT: {}, diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go index 9245d07c2..21abc2a3d 100644 --- a/pkg/sentry/platform/kvm/filters_arm64.go +++ b/pkg/sentry/platform/kvm/filters_arm64.go @@ -17,13 +17,22 @@ package kvm import ( "syscall" + "golang.org/x/sys/unix" + + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/seccomp" ) // SyscallFilters returns syscalls made exclusively by the KVM platform. func (*KVM) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - syscall.SYS_IOCTL: {}, + syscall.SYS_IOCTL: {}, + unix.SYS_MEMBARRIER: []seccomp.Rule{ + { + seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED), + seccomp.EqualTo(0), + }, + }, syscall.SYS_MMAP: {}, syscall.SYS_RT_SIGSUSPEND: {}, syscall.SYS_RT_SIGTIMEDWAIT: {}, diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index ae813e24e..dd45ad10b 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -63,6 +63,9 @@ type runData struct { type KVM struct { platform.NoCPUPreemptionDetection + // KVM never changes mm_structs. + platform.UseHostProcessMemoryBarrier + // machine is the backing VM. machine *machine } @@ -156,15 +159,7 @@ func (*KVM) MaxUserAddress() usermem.Addr { func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. pageTables := pagetables.New(newAllocator()) - applyPhysicalRegions(func(pr physicalRegion) bool { - // Map the kernel in the upper half. - pageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. - }) + k.machine.mapUpperHalf(pageTables) // Return the new address space. return &addressSpace{ diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go new file mode 100644 index 000000000..c0b4fd374 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -0,0 +1,51 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +func TestSegments(t *testing.T) { + applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTestSegments(regs) + for { + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application segment check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestSegments(regs); err != nil { + t.Errorf("application segment check with full restore failed: %v", err) + } + break // Done. + } + return false + }) +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go index 29d457a7e..0b06a923a 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_arm64.go @@ -26,6 +26,9 @@ type kvmOneReg struct { addr uint64 } +// arm64HypercallMMIOBase is MMIO base address used to dispatch hypercalls. +var arm64HypercallMMIOBase uintptr + const KVM_NR_SPSR = 5 type userFpsimdState struct { @@ -43,6 +46,18 @@ type userRegs struct { fpRegs userFpsimdState } +type exception struct { + sErrPending uint8 + sErrHasEsr uint8 + pad [6]uint8 + sErrEsr uint64 +} + +type kvmVcpuEvents struct { + exception + rsvd [12]uint32 +} + // updateGlobalOnce does global initialization. It has to be called only once. func updateGlobalOnce(fd int) error { physicalInit() diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go new file mode 100644 index 000000000..0e3d84d95 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_test.go @@ -0,0 +1,31 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package kvm + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" +) + +func TestKernelTLS(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + if !testutil.TLSWorks() { + t.Errorf("tls does not work, and it should!") + } + }) +} diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go index 6531bae1d..48ccf8474 100644 --- a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go @@ -22,7 +22,8 @@ import ( ) var ( - runDataSize int + runDataSize int + hasGuestPCID bool ) func updateSystemValues(fd int) error { @@ -33,6 +34,7 @@ func updateSystemValues(fd int) error { } // Save the data. runDataSize = int(sz) + hasGuestPCID = true // Success. return nil diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 1d5c77ff4..6abaa21c4 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -26,15 +26,21 @@ const ( _KVM_RUN = 0xae80 _KVM_NMI = 0xae9a _KVM_CHECK_EXTENSION = 0xae03 + _KVM_GET_TSC_KHZ = 0xaea3 + _KVM_SET_TSC_KHZ = 0xaea2 _KVM_INTERRUPT = 0x4004ae86 _KVM_SET_MSRS = 0x4008ae89 _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 _KVM_SET_REGS = 0x4090ae82 _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_MSRS = 0xc008ae88 _KVM_GET_REGS = 0x8090ae81 + _KVM_GET_SREGS = 0x8138ae83 _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 _KVM_SET_CPUID2 = 0x4008ae90 _KVM_SET_SIGNAL_MASK = 0x4004ae8b + _KVM_GET_VCPU_EVENTS = 0x8040ae9f + _KVM_SET_VCPU_EVENTS = 0x4040aea0 ) // KVM exit reasons. @@ -54,12 +60,16 @@ const ( // KVM capability options. const ( - _KVM_CAP_MAX_VCPUS = 0x42 - _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 + _KVM_CAP_MAX_MEMSLOTS = 0x0a + _KVM_CAP_MAX_VCPUS = 0x42 + _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5 + _KVM_CAP_VCPU_EVENTS = 0x29 + _KVM_CAP_ARM_INJECT_SERROR_ESR = 0x9e ) // KVM limits. const ( + _KVM_NR_MEMSLOTS = 0x100 _KVM_NR_VCPUS = 0xff _KVM_NR_INTERRUPTS = 0x100 _KVM_NR_CPUID_ENTRIES = 0x100 @@ -71,3 +81,16 @@ const ( _KVM_MEM_READONLY = uint32(1) << 1 _KVM_MEM_FLAGS_NONE = 0 ) + +// KVM hypercall list. +// +// Canonical list of hypercalls supported. +const ( + // On amd64, it uses 'HLT' to leave the guest. + // + // Unlike amd64, arm64 can only uses mmio_exit/psci to leave the guest. + // + // _KVM_HYPERCALL_VMEXIT is only used on arm64 for now. + _KVM_HYPERCALL_VMEXIT int = iota + _KVM_HYPERCALL_MAX +) diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go index 531ae8b1e..5831b9345 100644 --- a/pkg/sentry/platform/kvm/kvm_const_arm64.go +++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go @@ -38,6 +38,8 @@ const ( _KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080 _KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082 _KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600 + _KVM_ARM64_REGS_TIMER_CNT = 0x603000000013df1a + _KVM_ARM64_REGS_CNTFRQ_EL0 = 0x603000000013df00 ) // Arm64: Architectural Feature Access Control Register EL1. @@ -72,6 +74,7 @@ const ( _TCR_T0SZ_VA48 = 64 - 48 // VA=48 _TCR_T1SZ_VA48 = 64 - 48 // VA=48 + _TCR_A1 = 1 << 22 _TCR_ASID16 = 1 << 36 _TCR_TBI0 = 1 << 37 @@ -100,13 +103,20 @@ const ( // Arm64: Memory Attribute Indirection Register EL1. const ( - _MT_DEVICE_nGnRnE = 0 - _MT_DEVICE_nGnRE = 1 - _MT_DEVICE_GRE = 2 - _MT_NORMAL_NC = 3 - _MT_NORMAL = 4 - _MT_NORMAL_WT = 5 - _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8) + _MT_DEVICE_nGnRnE = 0 + _MT_DEVICE_nGnRE = 1 + _MT_DEVICE_GRE = 2 + _MT_NORMAL_NC = 3 + _MT_NORMAL = 4 + _MT_NORMAL_WT = 5 + _MT_ATTR_DEVICE_nGnRnE = 0x00 + _MT_ATTR_DEVICE_nGnRE = 0x04 + _MT_ATTR_DEVICE_GRE = 0x0c + _MT_ATTR_NORMAL_NC = 0x44 + _MT_ATTR_NORMAL_WT = 0xbb + _MT_ATTR_NORMAL = 0xff + _MT_ATTR_MASK = 0xff + _MT_EL1_INIT = (_MT_ATTR_DEVICE_nGnRnE << (_MT_DEVICE_nGnRnE * 8)) | (_MT_ATTR_DEVICE_nGnRE << (_MT_DEVICE_nGnRE * 8)) | (_MT_ATTR_DEVICE_GRE << (_MT_DEVICE_GRE * 8)) | (_MT_ATTR_NORMAL_NC << (_MT_NORMAL_NC * 8)) | (_MT_ATTR_NORMAL << (_MT_NORMAL * 8)) | (_MT_ATTR_NORMAL_WT << (_MT_NORMAL_WT * 8)) ) const ( @@ -116,6 +126,17 @@ const ( // Arm64: Exception Syndrome Register EL1. const ( + _ESR_ELx_EC_SHIFT = 26 + _ESR_ELx_EC_MASK = 0x3F << _ESR_ELx_EC_SHIFT + + _ESR_ELx_EC_IMP_DEF = 0x1f + _ESR_ELx_EC_IABT_LOW = 0x20 + _ESR_ELx_EC_IABT_CUR = 0x21 + _ESR_ELx_EC_PC_ALIGN = 0x22 + + _ESR_ELx_CM = 1 << 8 + _ESR_ELx_WNR = 1 << 6 + _ESR_ELx_FSC = 0x3F _ESR_SEGV_MAPERR_L0 = 0x4 @@ -131,3 +152,10 @@ const ( _ESR_SEGV_PEMERR_L2 = 0xe _ESR_SEGV_PEMERR_L3 = 0xf ) + +// Arm64: MMIO base address used to dispatch hypercalls. +const ( + // on Arm64, the MMIO address must be 64-bit aligned. + // Currently, we only need 1 hypercall: hypercall_vmexit. + _AARCH64_HYPERCALL_MMIO_SIZE = 1 << 3 +) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 6c8f4fa28..e58acc071 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) @@ -262,30 +263,6 @@ func TestRegistersFault(t *testing.T) { }) } -func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTestSegments(regs) - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != nil { - t.Errorf("application segment check with full restore got unexpected error: %v", err) - } - if err := testutil.CheckTestSegments(regs); err != nil { - t.Errorf("application segment check with full restore failed: %v", err) - } - break // Done. - } - return false - }) -} - func TestBounce(t *testing.T) { applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { @@ -435,9 +412,9 @@ func TestWrongVCPU(t *testing.T) { // Basic test, one then the other. bluepill(c1) bluepill(c2) - if c2.switches == 0 { + if c2.guestExits == 0 { // Don't allow the test to proceed if this fails. - t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } // Alternate vCPUs; we expect to need to trigger the @@ -446,11 +423,11 @@ func TestWrongVCPU(t *testing.T) { bluepill(c1) bluepill(c2) } - if count := c1.switches; count < 90 { - t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + if count := c1.guestExits; count < 90 { + t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } - if count := c2.switches; count < 90 { - t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + if count := c2.guestExits; count < 90 { + t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) } return false }) @@ -466,6 +443,22 @@ func TestWrongVCPU(t *testing.T) { }) } +func TestRdtsc(t *testing.T) { + var i int // Iteration count. + kvmTest(t, nil, func(c *vCPU) bool { + start := ktime.Rdtsc() + bluepill(c) + guest := ktime.Rdtsc() + redpill() + end := ktime.Rdtsc() + if start > guest || guest > end { + t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end) + } + i++ + return i < 100 + }) +} + func BenchmarkApplicationSyscall(b *testing.B) { var ( i int // Iteration includes machine.Get() / machine.Put(). diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index f1afc74dc..f70d761fd 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -43,25 +44,31 @@ type machine struct { // kernel is the set of global structures. kernel ring0.Kernel - // mappingCache is used for mapPhysical. - mappingCache sync.Map - // mu protects vCPUs. mu sync.RWMutex // available is notified when vCPUs are available. available sync.Cond - // vCPUs are the machine vCPUs. + // vCPUsByTID are the machine vCPUs. // // These are populated dynamically. - vCPUs map[uint64]*vCPU + vCPUsByTID map[uint64]*vCPU // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. - vCPUsByID map[int]*vCPU + vCPUsByID []*vCPU // maxVCPUs is the maximum number of vCPUs supported by the machine. maxVCPUs int + + // maxSlots is the maximum number of memory slots supported by the machine. + maxSlots int + + // usedSlots is the set of used physical addresses (sorted). + usedSlots []uintptr + + // nextID is the next vCPU ID. + nextID uint32 } const ( @@ -97,8 +104,11 @@ type vCPU struct { // tid is the last set tid. tid uint64 - // switches is a count of world switches (informational only). - switches uint32 + // userExits is the count of user exits. + userExits uint64 + + // guestExits is the count of guest to host world switches. + guestExits uint64 // faults is a count of world faults (informational only). faults uint32 @@ -121,6 +131,7 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. vCPUArchState + // dieState holds state related to vCPU death. dieState dieState } @@ -137,9 +148,8 @@ type dieState struct { // // Precondition: mu must be held. func (m *machine) newVCPU() *vCPU { - id := len(m.vCPUs) - // Create the vCPU. + id := int(atomic.AddUint32(&m.nextID, 1) - 1) fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) if errno != 0 { panic(fmt.Sprintf("error creating new vCPU: %v", errno)) @@ -150,7 +160,7 @@ func (m *machine) newVCPU() *vCPU { fd: int(fd), machine: m, } - c.CPU.Init(&m.kernel, c) + c.CPU.Init(&m.kernel, c.id, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. @@ -176,16 +186,10 @@ func (m *machine) newVCPU() *vCPU { // newMachine returns a new VM context. func newMachine(vm int) (*machine, error) { // Create the machine. - m := &machine{ - fd: vm, - vCPUs: make(map[uint64]*vCPU), - vCPUsByID: make(map[int]*vCPU), - } + m := &machine{fd: vm} m.available.L = &m.mu - m.kernel.Init(ring0.KernelOpts{ - PageTables: pagetables.New(newAllocator()), - }) + // Pull the maximum vCPUs. maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { m.maxVCPUs = _KVM_NR_VCPUS @@ -193,6 +197,21 @@ func newMachine(vm int) (*machine, error) { m.maxVCPUs = int(maxVCPUs) } log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) + m.vCPUsByTID = make(map[uint64]*vCPU) + m.vCPUsByID = make([]*vCPU, m.maxVCPUs) + m.kernel.Init(ring0.KernelOpts{ + PageTables: pagetables.New(newAllocator()), + }, m.maxVCPUs) + + // Pull the maximum slots. + maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS) + if errno != 0 { + m.maxSlots = _KVM_NR_MEMSLOTS + } else { + m.maxSlots = int(maxSlots) + } + log.Debugf("The maximum number of slots is %d.", m.maxSlots) + m.usedSlots = make([]uintptr, m.maxSlots) // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These @@ -205,15 +224,9 @@ func newMachine(vm int) (*machine, error) { pagetables.MapOpts{AccessType: usermem.AnyAccess}, pr.physical) - // And keep everything in the upper half. - m.kernel.PageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. }) + m.mapUpperHalf(m.kernel.PageTables) var physicalRegionsReadOnly []physicalRegion var physicalRegionsAvailable []physicalRegion @@ -270,10 +283,26 @@ func newMachine(vm int) (*machine, error) { return m, nil } +// hasSlot returns true iff the given address is mapped. +// +// This must be done via a linear scan. +// +//go:nosplit +func (m *machine) hasSlot(physical uintptr) bool { + for i := 0; i < len(m.usedSlots); i++ { + if p := atomic.LoadUintptr(&m.usedSlots[i]); p == physical { + return true + } + } + return false +} + // mapPhysical checks for the mapping of a physical range, and installs one if // not available. This attempts to be efficient for calls in the hot path. // // This panics on error. +// +//go:nosplit func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalRegion, flags uint32) { for end := physical + length; physical < end; { _, physicalStart, length, ok := calculateBluepillFault(physical, phyRegions) @@ -282,8 +311,8 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg panic("mapPhysical on unknown physical address") } - if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { - // Not present in the cache; requires setting the slot. + // Is this already mapped? Check the usedSlots. + if !m.hasSlot(physicalStart) { if _, ok := handleBluepillFault(m, physical, phyRegions, flags); !ok { panic("handleBluepillFault failed") } @@ -304,7 +333,11 @@ func (m *machine) Destroy() { runtime.SetFinalizer(m, nil) // Destroy vCPUs. - for _, c := range m.vCPUs { + for _, c := range m.vCPUsByID { + if c == nil { + continue + } + // Ensure the vCPU is not still running in guest mode. This is // possible iff teardown has been done by other threads, and // somehow a single thread has not executed any system calls. @@ -331,13 +364,18 @@ func (m *machine) Destroy() { // Get gets an available vCPU. // // This will return with the OS thread locked. +// +// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points +// to the vCPU in which the OS thread TID is running. So if Get() returns with +// the corrent context in guest, the vCPU of it must be the same as what +// Get() returns. func (m *machine) Get() *vCPU { m.mu.RLock() runtime.LockOSThread() tid := procid.Current() // Check for an exact match. - if c := m.vCPUs[tid]; c != nil { + if c := m.vCPUsByTID[tid]; c != nil { c.lock() m.mu.RUnlock() return c @@ -356,7 +394,7 @@ func (m *machine) Get() *vCPU { tid = procid.Current() // Recheck for an exact match. - if c := m.vCPUs[tid]; c != nil { + if c := m.vCPUsByTID[tid]; c != nil { c.lock() m.mu.Unlock() return c @@ -364,10 +402,10 @@ func (m *machine) Get() *vCPU { for { // Scan for an available vCPU. - for origTID, c := range m.vCPUs { + for origTID, c := range m.vCPUsByTID { if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { - delete(m.vCPUs, origTID) - m.vCPUs[tid] = c + delete(m.vCPUsByTID, origTID) + m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) return c @@ -375,17 +413,17 @@ func (m *machine) Get() *vCPU { } // Create a new vCPU (maybe). - if len(m.vCPUs) < m.maxVCPUs { + if int(m.nextID) < m.maxVCPUs { c := m.newVCPU() c.lock() - m.vCPUs[tid] = c + m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) return c } // Scan for something not in user mode. - for origTID, c := range m.vCPUs { + for origTID, c := range m.vCPUsByTID { if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { continue } @@ -403,8 +441,8 @@ func (m *machine) Get() *vCPU { } // Steal the vCPU. - delete(m.vCPUs, origTID) - m.vCPUs[tid] = c + delete(m.vCPUsByTID, origTID) + m.vCPUsByTID[tid] = c m.mu.Unlock() c.loadSegments(tid) return c @@ -431,7 +469,20 @@ func (m *machine) Put(c *vCPU) { // newDirtySet returns a new dirty set. func (m *machine) newDirtySet() *dirtySet { return &dirtySet{ - vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), + vCPUMasks: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), + } +} + +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUsByID { + if c != nil && c.PCIDs != nil { + c.PCIDs.Drop(pt) + } } } @@ -494,6 +545,8 @@ var pid = syscall.Getpid() // // This effectively unwinds the state machine. func (c *vCPU) bounce(forceGuestExit bool) { + origGuestExits := atomic.LoadUint64(&c.guestExits) + origUserExits := atomic.LoadUint64(&c.userExits) for { switch state := atomic.LoadUint32(&c.state); state { case vCPUReady, vCPUWaiter: @@ -549,6 +602,14 @@ func (c *vCPU) bounce(forceGuestExit bool) { // Should not happen: the above is exhaustive. panic("invalid state") } + + // Check if we've missed the state transition, but + // we can safely return at this point in time. + newGuestExits := atomic.LoadUint64(&c.guestExits) + newUserExits := atomic.LoadUint64(&c.userExits) + if newUserExits != origUserExits && (!forceGuestExit || newGuestExits != origGuestExits) { + return + } } } @@ -565,3 +626,35 @@ func (c *vCPU) BounceToKernel() { func (c *vCPU) BounceToHost() { c.bounce(true) } + +// setSystemTimeLegacy calibrates and sets an approximate system time. +func (c *vCPU) setSystemTimeLegacy() error { + const minIterations = 10 + minimum := uint64(0) + for iter := 0; ; iter++ { + // Try to set the TSC to an estimate of where it will be + // on the host during a "fast" system call iteration. + start := uint64(ktime.Rdtsc()) + if err := c.setTSC(start + (minimum / 2)); err != nil { + return err + } + // See if this is our new minimum call time. Note that this + // serves two functions: one, we make sure that we are + // accurately predicting the offset we need to set. Second, we + // don't want to do the final set on a slow call, which could + // produce a really bad result. + end := uint64(ktime.Rdtsc()) + if end < start { + continue // Totally bogus: unstable TSC? + } + current := end - start + if current < minimum || iter == 0 { + minimum = current // Set our new minimum. + } + // Is this past minIterations and within ~10% of minimum? + upperThreshold := (((minimum << 3) + minimum) >> 3) + if iter >= minIterations && current <= upperThreshold { + return nil + } + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 923ce3909..a8b729e62 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -18,14 +18,17 @@ package kvm import ( "fmt" + "math/big" "reflect" "runtime/debug" "syscall" + "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + ktime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/usermem" ) @@ -51,9 +54,10 @@ func (m *machine) initArchState() error { recover() debug.SetPanicOnFault(old) }() - m.retryInGuest(func() { - ring0.SetCPUIDFaulting(true) - }) + c := m.Get() + defer m.Put(c) + bluepill(c) + ring0.SetCPUIDFaulting(true) return nil } @@ -83,19 +87,6 @@ const ( poolPCIDs = 8 ) -// dropPageTables drops cached page table entries. -func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - - // Clear from all PCIDs. - for _, c := range m.vCPUs { - if c.PCIDs != nil { - c.PCIDs.Drop(pt) - } - } -} - // initArchState initializes architecture-specific state. func (c *vCPU) initArchState() error { var ( @@ -143,6 +134,7 @@ func (c *vCPU) initArchState() error { // Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet // Set the system registers. @@ -151,8 +143,8 @@ func (c *vCPU) initArchState() error { } // Set the user registers. - if err := c.setUserRegisters(&kernelUserRegs); err != nil { - return err + if errno := c.setUserRegisters(&kernelUserRegs); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) } // Allocate some floating point state save area for the local vCPU. @@ -165,6 +157,101 @@ func (c *vCPU) initArchState() error { return c.setSystemTime() } +// bitsForScaling returns the bits available for storing the fraction component +// of the TSC scaling ratio. This allows us to replicate the (bad) math done by +// the kernel below in scaledTSC, and ensure we can compute an exact zero +// offset in setSystemTime. +// +// These constants correspond to kvm_tsc_scaling_ratio_frac_bits. +var bitsForScaling = func() int64 { + fs := cpuid.HostFeatureSet() + if fs.Intel() { + return 48 // See vmx.c (kvm sources). + } else if fs.AMD() { + return 32 // See svm.c (svm sources). + } else { + return 63 // Unknown: theoretical maximum. + } +}() + +// scaledTSC returns the host TSC scaled by the given frequency. +// +// This assumes a current frequency of 1. We require only the unitless ratio of +// rawFreq to some current frequency. See setSystemTime for context. +// +// The kernel math guarantees that all bits of the multiplication and division +// will be correctly preserved and applied. However, it is not possible to +// actually store the ratio correctly. So we need to use the same schema in +// order to calculate the scaled frequency and get the same result. +// +// We can assume that the current frequency is (1), so we are calculating a +// strict inverse of this value. This simplifies this function considerably. +// +// Roughly, the returned value "scaledTSC" will have: +// scaledTSC/hostTSC == 1/rawFreq +// +//go:nosplit +func scaledTSC(rawFreq uintptr) int64 { + scale := int64(1 << bitsForScaling) + ratio := big.NewInt(scale / int64(rawFreq)) + ratio.Mul(ratio, big.NewInt(int64(ktime.Rdtsc()))) + ratio.Div(ratio, big.NewInt(scale)) + return ratio.Int64() +} + +// setSystemTime sets the vCPU to the system time. +func (c *vCPU) setSystemTime() error { + // First, scale down the clock frequency to the lowest value allowed by + // the API itself. How low we can go depends on the underlying + // hardware, but it is typically ~1/2^48 for Intel, ~1/2^32 for AMD. + // Even the lower bound here will take a 4GHz frequency down to 1Hz, + // meaning that everything should be able to handle a Khz setting of 1 + // with bits to spare. + // + // Note that reducing the clock does not typically require special + // capabilities as it is emulated in KVM. We don't actually use this + // capability, but it means that this method should be robust to + // different hardware configurations. + rawFreq, err := c.getTSCFreq() + if err != nil { + return c.setSystemTimeLegacy() + } + if err := c.setTSCFreq(1); err != nil { + return c.setSystemTimeLegacy() + } + + // Always restore the original frequency. + defer func() { + if err := c.setTSCFreq(rawFreq); err != nil { + panic(err.Error()) + } + }() + + // Attempt to set the system time in this compressed world. The + // calculation for offset normally looks like: + // + // offset = target_tsc - kvm_scale_tsc(vcpu, rdtsc()); + // + // So as long as the kvm_scale_tsc component is constant before and + // after the call to set the TSC value (and it is passes as the + // target_tsc), we will compute an offset value of zero. + // + // This is effectively cheating to make our "setSystemTime" call so + // unbelievably, incredibly fast that we do it "instantly" and all the + // calculations result in an offset of zero. + lastTSC := scaledTSC(rawFreq) + for { + if err := c.setTSC(uint64(lastTSC)); err != nil { + return err + } + nextTSC := scaledTSC(rawFreq) + if lastTSC == nextTSC { + return nil + } + lastTSC = nextTSC // Try again. + } +} + // nonCanonical generates a canonical address return. // //go:nosplit @@ -335,29 +422,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) } } -// retryInGuest runs the given function in guest mode. -// -// If the function does not complete in guest mode (due to execution of a -// system call due to a GC stall, for example), then it will be retried. The -// given function must be idempotent as a result of the retry mechanism. -func (m *machine) retryInGuest(fn func()) { - c := m.Get() - defer m.Put(c) - for { - c.ClearErrorCode() // See below. - bluepill(c) // Force guest mode. - fn() // Execute the given function. - _, user := c.ErrorCode() - if user { - // If user is set, then we haven't bailed back to host - // mode via a kernel exception or system call. We - // consider the full function to have executed in guest - // mode and we can return. - break - } - } -} - // On x86 platform, the flags for "setMemoryRegion" can always be set as 0. // There is no need to return read-only physicalRegions. func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { @@ -367,3 +431,41 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { func availableRegionsForSetMem() (phyRegions []physicalRegion) { return physicalRegions } + +var execRegions = func() (regions []region) { + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { + return + } + if vr.accessType.Execute { + regions = append(regions, vr.region) + } + }) + return +}() + +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + for _, r := range execRegions { + physical, length, ok := translateToPhysical(r.virtual) + if !ok || length < r.length { + panic("impossilbe translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|r.virtual), + r.length, + pagetables.MapOpts{AccessType: usermem.Execute}, + physical) + } + for start, end := range m.kernel.EntryRegions() { + regionLen := end - start + physical, length, ok := translateToPhysical(start) + if !ok || length < regionLen { + panic("impossible translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|start), + regionLen, + pagetables.MapOpts{AccessType: usermem.ReadWrite}, + physical) + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 7156c245f..b430f92c6 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -23,7 +23,6 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/time" ) // loadSegments copies the current segments. @@ -61,91 +60,63 @@ func (c *vCPU) setCPUID() error { return nil } -// setSystemTime sets the TSC for the vCPU. +// getTSCFreq gets the TSC frequency. // -// This has to make the call many times in order to minimize the intrinsic -// error in the offset. Unfortunately KVM does not expose a relative offset via -// the API, so this is an approximation. We do this via an iterative algorithm. -// This has the advantage that it can generally deal with highly variable -// system call times and should converge on the correct offset. -func (c *vCPU) setSystemTime() error { - const ( - _MSR_IA32_TSC = 0x00000010 - calibrateTries = 10 - ) - registers := modelControlRegisters{ - nmsrs: 1, - } - registers.entries[0] = modelControlRegister{ - index: _MSR_IA32_TSC, +// If mustSucceed is true, then this function panics on error. +func (c *vCPU) getTSCFreq() (uintptr, error) { + rawFreq, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_TSC_KHZ, + 0 /* ignored */) + if errno != 0 { + return 0, errno } - target := uint64(^uint32(0)) - for done := 0; done < calibrateTries; { - start := uint64(time.Rdtsc()) - registers.entries[0].data = start + target - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_MSRS, - uintptr(unsafe.Pointer(®isters))); errno != 0 { - return fmt.Errorf("error setting system time: %v", errno) - } - // See if this is our new minimum call time. Note that this - // serves two functions: one, we make sure that we are - // accurately predicting the offset we need to set. Second, we - // don't want to do the final set on a slow call, which could - // produce a really bad result. So we only count attempts - // within +/- 6.25% of our minimum as an attempt. - end := uint64(time.Rdtsc()) - if end < start { - continue // Totally bogus. - } - half := (end - start) / 2 - if half < target { - target = half - } - if (half - target) < target/8 { - done++ - } + return rawFreq, nil +} + +// setTSCFreq sets the TSC frequency. +func (c *vCPU) setTSCFreq(freq uintptr) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_TSC_KHZ, + freq /* khz */); errno != 0 { + return fmt.Errorf("error setting TSC frequency: %v", errno) } return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 +// setTSC sets the TSC value. +func (c *vCPU) setTSC(value uint64) error { + const _MSR_IA32_TSC = 0x00000010 + registers := modelControlRegisters{ + nmsrs: 1, } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) + registers.entries[0].index = _MSR_IA32_TSC + registers.entries[0].data = value if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) + _KVM_SET_MSRS, + uintptr(unsafe.Pointer(®isters))); errno != 0 { + return fmt.Errorf("error setting tsc: %v", errno) } return nil } // setUserRegisters sets user registers in the vCPU. -func (c *vCPU) setUserRegisters(uregs *userRegs) error { +// +//go:nosplit +func (c *vCPU) setUserRegisters(uregs *userRegs) syscall.Errno { if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, uintptr(c.fd), _KVM_SET_REGS, uintptr(unsafe.Pointer(uregs))); errno != 0 { - return fmt.Errorf("error setting user registers: %v", errno) + return errno } - return nil + return 0 } // getUserRegisters reloads user registers in the vCPU. @@ -154,7 +125,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) error { // //go:nosplit func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { - if _, _, errno := syscall.RawSyscall( + if _, _, errno := syscall.RawSyscall( // escapes: no. syscall.SYS_IOCTL, uintptr(c.fd), _KVM_GET_REGS, @@ -175,3 +146,17 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { } return nil } + +// getSystemRegisters sets system registers. +// +//go:nosplit +func (c *vCPU) getSystemRegisters(sregs *systemRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return errno + } + return 0 +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 750751aa3..54837f20c 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -19,6 +19,7 @@ package kvm import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) @@ -48,6 +49,18 @@ const ( poolPCIDs = 8 ) +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + applyPhysicalRegions(func(pr physicalRegion) bool { + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) +} + // Get all read-only physicalRegions. func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { var rdonlyRegions []region @@ -100,19 +113,6 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { return phyRegions } -// dropPageTables drops cached page table entries. -func (m *machine) dropPageTables(pt *pagetables.PageTables) { - m.mu.Lock() - defer m.mu.Unlock() - - // Clear from all PCIDs. - for _, c := range m.vCPUs { - if c.PCIDs != nil { - c.PCIDs.Drop(pt) - } - } -} - // nonCanonical generates a canonical address return. // //go:nosplit @@ -125,71 +125,59 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.Acc return usermem.NoAccess, platform.ErrContextSignal } +// isInstructionAbort returns true if it is an instruction abort. +// +//go:nosplit +func isInstructionAbort(code uint64) bool { + value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT + return value == _ESR_ELx_EC_IABT_LOW +} + +// isWriteFault returns whether it is a write fault. +// +//go:nosplit +func isWriteFault(code uint64) bool { + if isInstructionAbort(code) { + return false + } + + return (code & _ESR_ELx_WNR) != 0 +} + // fault generates an appropriate fault return. // //go:nosplit func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. faultAddr := c.GetFaultAddr() code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return usermem.NoAccess, platform.ErrContextInterrupt + } + // Reset the pointed SignalInfo. *info = arch.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) - read := true - write := false - execute := true - ret := code & _ESR_ELx_FSC switch ret { case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3: info.Code = 1 //SEGV_MAPERR - read = false - write = true - execute = false case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3: info.Code = 2 // SEGV_ACCERR. - read = true - write = false - execute = false default: info.Code = 2 } - if !user { - read = true - write = false - execute = true - - } accessType := usermem.AccessType{ - Read: read, - Write: write, - Execute: execute, + Read: !isWriteFault(uint64(code)), + Write: isWriteFault(uint64(code)), + Execute: isInstructionAbort(uint64(code)), } return accessType, platform.ErrContextSignal } - -// retryInGuest runs the given function in guest mode. -// -// If the function does not complete in guest mode (due to execution of a -// system call due to a GC stall, for example), then it will be retried. The -// given function must be idempotent as a result of the retry mechanism. -func (m *machine) retryInGuest(fn func()) { - c := m.Get() - defer m.Put(c) - for { - c.ClearErrorCode() // See below. - bluepill(c) // Force guest mode. - fn() // Execute the given function. - _, user := c.ErrorCode() - if user { - // If user is set, then we haven't bailed back to host - // mode via a kernel exception or system call. We - // consider the full function to have executed in guest - // mode and we can return. - break - } - } -} diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 3c02cef7c..1344ed3c9 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) @@ -60,7 +61,6 @@ func (c *vCPU) initArchState() error { reg.addr = uint64(reflect.ValueOf(&data).Pointer()) regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer()) - vcpuInit.target = _KVM_ARM_TARGET_GENERIC_V8 vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2) if _, _, errno := syscall.RawSyscall( syscall.SYS_IOCTL, @@ -78,21 +78,8 @@ func (c *vCPU) initArchState() error { return err } - // sctlr_el1 - regGet.id = _KVM_ARM64_REGS_SCTLR_EL1 - if err := c.getOneRegister(®Get); err != nil { - return err - } - - dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I) - data = dataGet - reg.id = _KVM_ARM64_REGS_SCTLR_EL1 - if err := c.setOneRegister(®); err != nil { - return err - } - // tcr_el1 - data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS + data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS | _TCR_A1 reg.id = _KVM_ARM64_REGS_TCR_EL1 if err := c.setOneRegister(®); err != nil { return err @@ -116,7 +103,7 @@ func (c *vCPU) initArchState() error { c.SetTtbr0Kvm(uintptr(data)) // ttbr1_el1 - data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0) + data = c.machine.kernel.PageTables.TTBR1_EL1(false, 1) reg.id = _KVM_ARM64_REGS_TTBR1_EL1 if err := c.setOneRegister(®); err != nil { @@ -159,16 +146,46 @@ func (c *vCPU) initArchState() error { return err } - data = ring0.PsrDefaultSet | ring0.KernelFlagsSet - reg.id = _KVM_ARM64_REGS_PSTATE + // Use the address of the exception vector table as + // the MMIO address base. + arm64HypercallMMIOBase = toLocation + + // Initialize the PCID database. + if hasGuestPCID { + // Note that NewPCIDs may return a nil table here, in which + // case we simply don't use PCID support (see below). In + // practice, this should not happen, however. + c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) + } + + c.floatingPointState = arch.NewFloatingPointData() + + return c.setSystemTime() +} + +// setTSC sets the counter Virtual Offset. +func (c *vCPU) setTSC(value uint64) error { + var ( + reg kvmOneReg + data uint64 + ) + + reg.addr = uint64(reflect.ValueOf(&data).Pointer()) + reg.id = _KVM_ARM64_REGS_TIMER_CNT + data = uint64(value) + if err := c.setOneRegister(®); err != nil { return err } - c.floatingPointState = arch.NewFloatingPointData() return nil } +// setSystemTime sets the vCPU to the system time. +func (c *vCPU) setSystemTime() error { + return c.setSystemTimeLegacy() +} + //go:nosplit func (c *vCPU) loadSegments(tid uint64) { // TODO(gvisor.dev/issue/1238): TLS is not supported. @@ -198,42 +215,6 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error { return nil } -// setCPUID sets the CPUID to be used by the guest. -func (c *vCPU) setCPUID() error { - return nil -} - -// setSystemTime sets the TSC for the vCPU. -func (c *vCPU) setSystemTime() error { - return nil -} - -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - - return nil -} - // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. @@ -243,6 +224,13 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info) } + // Assign PCIDs. + if c.PCIDs != nil { + var requireFlushPCID bool // Force a flush? + switchOpts.UserASID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables) + switchOpts.Flush = switchOpts.Flush || requireFlushPCID + } + var vector ring0.Vector ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0) c.SetTtbr0App(uintptr(ttbr0App)) @@ -269,10 +257,19 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) case ring0.PageFault: return c.fault(int32(syscall.SIGSEGV), info) - case 0xaa: - return usermem.NoAccess, nil + case ring0.Vector(bounce): // ring0.VirtualizationException + return usermem.NoAccess, platform.ErrContextInterrupt + case ring0.El0Sync_undef: + return c.fault(int32(syscall.SIGILL), info) + case ring0.El1Sync_undef: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGILL), + Code: 1, // ILL_ILLOPC (illegal opcode). + } + info.SetAddr(switchOpts.Registers.Pc) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal default: - return usermem.NoAccess, platform.ErrContextSignal + panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } } diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index de7df4f80..1d6ca245a 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -115,7 +115,7 @@ func (a *atomicAddressSpace) get() *addressSpace { // //go:nosplit func (c *vCPU) notify() { - _, _, errno := syscall.RawSyscall6( + _, _, errno := syscall.RawSyscall6( // escapes: no. syscall.SYS_FUTEX, uintptr(unsafe.Pointer(&c.state)), linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, @@ -143,3 +143,29 @@ func (c *vCPU) waitUntilNot(state uint32) { panic("futex wait error") } } + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + + return nil +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go index ca902c8c1..c5235ca9d 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go @@ -23,6 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) +// TLSWorks is a tls test. +// +// It returns true or false. +func TLSWorks() bool + // SetTestTarget sets the rip appropriately. func SetTestTarget(regs *arch.Registers, fn func()) { regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) @@ -56,5 +61,9 @@ func CheckTestRegs(regs *arch.Registers, full bool) (err error) { err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) } } + // Check tls. + if need := ^uint64(11); regs.TPIDR_EL0 != need { + err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need) + } return } diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s index 0bebee852..7348c29a5 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s @@ -50,8 +50,26 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0 start: B start +TEXT ·TLSWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + MOVD $0x6789, R5 + MSR R5, TPIDR_EL0 + MOVD $SYS_GETPID, R8 // getpid + SVC + MRS TPIDR_EL0, R6 + CMP R5, R6 + BNE isNaN + MOVD $1, R0 + MOVD R0, ret+0(FP) + RET +isNaN: + MOVD $0, ret+0(FP) + RET + TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS + // gc will touch fpsimd, so we should test it. + // such as in <runtime.deductSweepCredit>. FMOVD $(9.9), F0 MOVD $SYS_GETPID, R8 // getpid SVC @@ -102,5 +120,15 @@ isNaN: TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_svc. SVC RET // never reached + +TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 + TWIDDLE_REGS() + MSR R10, TPIDR_EL0 + // Trapped in el0_ia. + // Branch to Register branches unconditionally to an address in <Rn>. + JMP (R6) // <=> br x6, must fault + RET // never reached diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index c8897d34f..4dcdbf8a7 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -34,7 +34,7 @@ type virtualRegion struct { } // mapsLine matches a single line from /proc/PID/maps. -var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)") +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)") // excludeRegion returns true if these regions should be excluded from the // physical map. Virtual regions need to be excluded if get_user_pages will diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 171513f3f..dcfe839a7 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,9 +22,11 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/hostmm" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) @@ -51,6 +53,10 @@ type Platform interface { // can reliably return ErrContextCPUPreempted. DetectsCPUPreemption() bool + // HaveGlobalMemoryBarrier returns true if the GlobalMemoryBarrier method + // is supported. + HaveGlobalMemoryBarrier() bool + // MapUnit returns the alignment used for optional mappings into this // platform's AddressSpaces. Higher values indicate lower per-page costs // for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates @@ -96,6 +102,15 @@ type Platform interface { // called. PreemptAllCPUs() error + // GlobalMemoryBarrier blocks until all threads running application code + // (via Context.Switch) and all task goroutines "have passed through a + // state where all memory accesses to user-space addresses match program + // order between entry to and return from [GlobalMemoryBarrier]", as for + // membarrier(2). + // + // Preconditions: HaveGlobalMemoryBarrier() == true. + GlobalMemoryBarrier() error + // SyscallFilters returns syscalls made exclusively by this platform. SyscallFilters() seccomp.SyscallRules } @@ -114,6 +129,54 @@ func (NoCPUPreemptionDetection) PreemptAllCPUs() error { panic("This platform does not support CPU preemption detection") } +// UseHostGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and +// Platform.GlobalMemoryBarrier by invoking equivalent functionality on the +// host. +type UseHostGlobalMemoryBarrier struct{} + +// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. +func (UseHostGlobalMemoryBarrier) HaveGlobalMemoryBarrier() bool { + return hostmm.HaveGlobalMemoryBarrier() +} + +// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. +func (UseHostGlobalMemoryBarrier) GlobalMemoryBarrier() error { + return hostmm.GlobalMemoryBarrier() +} + +// UseHostProcessMemoryBarrier implements Platform.HaveGlobalMemoryBarrier and +// Platform.GlobalMemoryBarrier by invoking a process-local memory barrier. +// This is faster than UseHostGlobalMemoryBarrier, but is only appropriate for +// platforms for which application code executes while using the sentry's +// mm_struct. +type UseHostProcessMemoryBarrier struct{} + +// HaveGlobalMemoryBarrier implements Platform.HaveGlobalMemoryBarrier. +func (UseHostProcessMemoryBarrier) HaveGlobalMemoryBarrier() bool { + // Fall back to a global memory barrier if a process-local one isn't + // available. + return hostmm.HaveProcessMemoryBarrier() || hostmm.HaveGlobalMemoryBarrier() +} + +// GlobalMemoryBarrier implements Platform.GlobalMemoryBarrier. +func (UseHostProcessMemoryBarrier) GlobalMemoryBarrier() error { + if hostmm.HaveProcessMemoryBarrier() { + return hostmm.ProcessMemoryBarrier() + } + return hostmm.GlobalMemoryBarrier() +} + +// MemoryManager represents an abstraction above the platform address space +// which manages memory mappings and their contents. +type MemoryManager interface { + //usermem.IO provides access to the contents of a virtual memory space. + usermem.IO + // MMap establishes a memory mapping. + MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) + // AddressSpace returns the AddressSpace bound to mm. + AddressSpace() AddressSpace +} + // Context represents the execution context for a single thread. type Context interface { // Switch resumes execution of the thread specified by the arch.Context @@ -143,7 +206,36 @@ type Context interface { // concurrent call to Switch(). // // - ErrContextCPUPreempted: See the definition of that error for details. - Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) + + // PullFullState() pulls a full state of the application thread. + // + // A platform can support lazy loading/restoring of a thread state + // which includes registers and a floating point state. + // + // For example, when the Sentry handles a system call, it may have only + // syscall arguments without other registers and a floating point + // state. And in this case, if the Sentry will need to construct a + // signal frame to call a signal handler, it will need to call + // PullFullState() to load all registers and FPU state. + // + // Preconditions: The caller must be running on the task goroutine. + PullFullState(as AddressSpace, ac arch.Context) + + // FullStateChanged() indicates that a thread state has been changed by + // the Sentry. This happens in case of the rt_sigreturn, execve, etc. + // + // First, it indicates that the Sentry has the full state of the thread + // and PullFullState() has to do nothing if it is called after + // FullStateChanged(). + // + // Second, it forces restoring the full state of the application + // thread. A platform can support lazy loading/restoring of a thread + // state. This means that if the Sentry has not changed a thread state, + // the platform may not restore it. + // + // Preconditions: The caller must be running on the task goroutine. + FullStateChanged() // Interrupt interrupts a concurrent call to Switch(), causing it to return // ErrContextInterrupt. @@ -204,20 +296,32 @@ type AddressSpace interface { // physical memory) to the mapping. The precommit flag is advisory and // implementations may choose to ignore it. // - // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. - // at.Any() == true. At least one reference must be held on all pages in - // fr, and must continue to be held as long as pages are mapped. - MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error + // Preconditions: + // * addr and fr must be page-aligned. + // * fr.Length() > 0. + // * at.Any() == true. + // * At least one reference must be held on all pages in fr, and must + // continue to be held as long as pages are mapped. + MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. // - // Preconditions: addr is page-aligned. length > 0. + // Preconditions: + // * addr is page-aligned. + // * length > 0. Unmap(addr usermem.Addr, length uint64) // Release releases this address space. After releasing, a new AddressSpace // must be acquired via platform.NewAddressSpace(). Release() + // PreFork() is called before creating a copy of AddressSpace. This + // guarantees that this address space will be in a consistent state. + PreFork() + + // PostFork() is called after creating a copy of AddressSpace. + PostFork() + // AddressSpaceIO methods are supported iff the associated platform's // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this // does not hold may panic if AddressSpaceIO methods are invoked. @@ -310,52 +414,6 @@ func (f SegmentationFault) Error() string { return fmt.Sprintf("segmentation fault at %#x", f.Addr) } -// File represents a host file that may be mapped into an AddressSpace. -type File interface { - // All pages in a File are reference-counted. - - // IncRef increments the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. (The File - // interface does not provide a way to acquire an initial reference; - // implementors may define mechanisms for doing so.) - IncRef(fr FileRange) - - // DecRef decrements the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. - DecRef(fr FileRange) - - // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. - // - // Note that fr.Start and fr.End need not be page-aligned. - // - // Preconditions: fr.Length() > 0. At least one reference must be held on - // all pages in fr. - // - // Postconditions: The returned mapping is valid as long as at least one - // reference is held on the mapped pages. - MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - - // FD returns the file descriptor represented by the File. - // - // The only permitted operation on the returned file descriptor is to map - // pages from it consistent with the requirements of AddressSpace.MapFile. - FD() int -} - -// FileRange represents a range of uint64 offsets into a File. -// -// type FileRange <generated using go_generics> - -// String implements fmt.Stringer.String. -func (fr FileRange) String() string { - return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) -} - // Requirements is used to specify platform specific requirements. type Requirements struct { // RequiresCurrentPIDNS indicates that the sandbox has to be started in the diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 30402c2df..fc43cc3c0 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -24,12 +24,13 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/procid", "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/hostcpu", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sync", diff --git a/pkg/sentry/platform/ptrace/filters.go b/pkg/sentry/platform/ptrace/filters.go index 1e07cfd0d..b0970e356 100644 --- a/pkg/sentry/platform/ptrace/filters.go +++ b/pkg/sentry/platform/ptrace/filters.go @@ -24,10 +24,9 @@ import ( // SyscallFilters returns syscalls made exclusively by the ptrace platform. func (*PTrace) SyscallFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ - unix.SYS_GETCPU: {}, - unix.SYS_SCHED_SETAFFINITY: {}, - syscall.SYS_PTRACE: {}, - syscall.SYS_TGKILL: {}, - syscall.SYS_WAIT4: {}, + unix.SYS_GETCPU: {}, + syscall.SYS_PTRACE: {}, + syscall.SYS_TGKILL: {}, + syscall.SYS_WAIT4: {}, } } diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 08d055e05..f56aa3b79 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -48,6 +48,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" @@ -95,7 +96,8 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) { + as := mm.AddressSpace() s := as.(*subprocess) isSyscall := s.switchToApp(c, ac) @@ -180,10 +182,17 @@ func (c *context) Interrupt() { // Release implements platform.Context.Release(). func (c *context) Release() {} +// FullStateChanged implements platform.Context.FullStateChanged. +func (c *context) FullStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac arch.Context) {} + // PTrace represents a collection of ptrace subprocesses. type PTrace struct { platform.MMapMinAddr platform.NoCPUPreemptionDetection + platform.UseHostGlobalMemoryBarrier } // New returns a new ptrace-based implementation of the platform interface. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2389423b0..812ab80ef 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -517,11 +518,6 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { } defer c.interrupt.Disable() - // Ensure that the CPU set is bound appropriately; this makes the - // emulation below several times faster, presumably by avoiding - // interprocessor wakeups and by simplifying the schedule. - t.bind() - // Set registers. if err := t.setRegs(regs); err != nil { panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err)) @@ -616,7 +612,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp } // MapFile implements platform.AddressSpace.MapFile. -func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { var flags int if precommit { flags |= syscall.MAP_POPULATE @@ -661,3 +657,9 @@ func (s *subprocess) Unmap(addr usermem.Addr, length uint64) { panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err)) } } + +// PreFork implements platform.AddressSpace.PreFork. +func (s *subprocess) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. +func (s *subprocess) PostFork() {} diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 84b699f0d..020bbda79 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -201,7 +201,7 @@ func appendArchSeccompRules(rules []seccomp.RuleSet, defaultAction linux.BPFActi seccomp.RuleSet{ Rules: seccomp.SyscallRules{ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)}, + {seccomp.EqualTo(linux.ARCH_SET_CPUID), seccomp.EqualTo(0)}, }, }, Action: linux.SECCOMP_RET_ALLOW, diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go index 2ce528601..8548853da 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux.go @@ -80,9 +80,9 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro Rules: seccomp.SyscallRules{ syscall.SYS_CLONE: []seccomp.Rule{ // Allow creation of new subprocesses (used by the master). - {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)}, + {seccomp.EqualTo(syscall.CLONE_FILES | syscall.SIGKILL)}, // Allow creation of new threads within a single address space (used by addresss spaces). - {seccomp.AllowValue( + {seccomp.EqualTo( syscall.CLONE_FILES | syscall.CLONE_FS | syscall.CLONE_SIGHAND | @@ -97,14 +97,14 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro // For the stub prctl dance (all). syscall.SYS_PRCTL: []seccomp.Rule{ - {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)}, + {seccomp.EqualTo(syscall.PR_SET_PDEATHSIG), seccomp.EqualTo(syscall.SIGKILL)}, }, syscall.SYS_GETPPID: {}, // For the stub to stop itself (all). syscall.SYS_GETPID: {}, syscall.SYS_KILL: []seccomp.Rule{ - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SIGSTOP)}, }, // Injected to support the address space operations. @@ -115,7 +115,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro }) } rules = appendArchSeccompRules(rules, defaultAction) - instrs, err := seccomp.BuildProgram(rules, defaultAction) + instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction) if err != nil { return nil, err } diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 245b20722..533e45497 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -18,29 +18,12 @@ package ptrace import ( - "sync/atomic" "syscall" "unsafe" - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/sync" ) -// maskPool contains reusable CPU masks for setting affinity. Unfortunately, -// runtime.NumCPU doesn't actually record the number of CPUs on the system, it -// just records the number of CPUs available in the scheduler affinity set at -// startup. This may a) change over time and b) gives a number far lower than -// the maximum indexable CPU. To prevent lots of allocation in the hot path, we -// use a pool to store large masks that we can reuse during bind. -var maskPool = sync.Pool{ - New: func() interface{} { - const maxCPUs = 1024 // Not a hard limit; see below. - return make([]uintptr, maxCPUs/64) - }, -} - // unmaskAllSignals unmasks all signals on the current thread. // //go:nosplit @@ -49,47 +32,3 @@ func unmaskAllSignals() syscall.Errno { _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0) return errno } - -// setCPU sets the CPU affinity. -func (t *thread) setCPU(cpu uint32) error { - mask := maskPool.Get().([]uintptr) - n := int(cpu / 64) - v := uintptr(1 << uintptr(cpu%64)) - if n >= len(mask) { - // See maskPool note above. We've actually exceeded the number - // of available cores. Grow the mask and return it. - mask = make([]uintptr, n+1) - } - mask[n] |= v - if _, _, errno := syscall.RawSyscall( - unix.SYS_SCHED_SETAFFINITY, - uintptr(t.tid), - uintptr(len(mask)*8), - uintptr(unsafe.Pointer(&mask[0]))); errno != 0 { - return errno - } - mask[n] &^= v - maskPool.Put(mask) - return nil -} - -// bind attempts to ensure that the thread is on the same CPU as the current -// thread. This provides no guarantees as it is fundamentally a racy operation: -// CPU sets may change and we may be rescheduled in the middle of this -// operation. As a result, no failures are reported. -// -// Precondition: the current runtime thread should be locked. -func (t *thread) bind() { - currentCPU := hostcpu.GetCPU() - - if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU { - // Set the affinity on the thread and save the CPU for next - // round; we don't expect CPUs to bounce around too frequently. - // - // (It's worth noting that we could move CPUs between this point - // and when the tracee finishes executing. But that would be - // roughly the status quo anyways -- we're just maximizing our - // chances of colocation, not guaranteeing it.) - t.setCPU(currentCPU) - } -} diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go index 0bee995e4..7ee20d89a 100644 --- a/pkg/sentry/platform/ptrace/subprocess_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go index 8122ac6e2..87a573cc4 100644 --- a/pkg/sentry/platform/ring0/aarch64.go +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -26,30 +26,31 @@ const ( _PMD_PGT_SIZE = 0x4000 _PTE_PGT_BASE = 0x7000 _PTE_PGT_SIZE = 0x1000 - - _PSR_D_BIT = 0x00000200 - _PSR_A_BIT = 0x00000100 - _PSR_I_BIT = 0x00000080 - _PSR_F_BIT = 0x00000040 ) const ( - // PSR bits - PSR_MODE_EL0t = 0x00000000 - PSR_MODE_EL1t = 0x00000004 - PSR_MODE_EL1h = 0x00000005 - PSR_MODE_MASK = 0x0000000f + // DAIF bits:debug, sError, IRQ, FIQ. + _PSR_D_BIT = 0x00000200 + _PSR_A_BIT = 0x00000100 + _PSR_I_BIT = 0x00000080 + _PSR_F_BIT = 0x00000040 + _PSR_DAIF_SHIFT = 6 + _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = PSR_MODE_EL1h + // PSR bits. + _PSR_MODE_EL0t = 0x00000000 + _PSR_MODE_EL1t = 0x00000004 + _PSR_MODE_EL1h = 0x00000005 + _PSR_MODE_MASK = 0x0000000f - // UserFlagsSet are always set in userspace. - UserFlagsSet = PSR_MODE_EL0t + PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK + PsrModeMask = _PSR_MODE_MASK - KernelFlagsClear = PSR_MODE_MASK - UserFlagsClear = PSR_MODE_MASK + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT - PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT + // UserFlagsSet are always set in userspace. + UserFlagsSet = _PSR_MODE_EL0t ) // Vector is an exception vector. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9c6c2cf5c..00899273e 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -76,15 +76,41 @@ type KernelOpts struct { type KernelArchState struct { KernelOpts + // cpuEntries is array of kernelEntry for all cpus + cpuEntries []kernelEntry + // globalIDT is our set of interrupt gates. - globalIDT idt64 + globalIDT *idt64 } -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { +// kernelEntry contains minimal CPU-specific arch state +// that can be mapped at the upper of the address space. +// Malicious APP might steal info from it via CPU bugs. +type kernelEntry struct { // stack is the stack used for interrupts on this CPU. stack [256]byte + // scratch space for temporary usage. + scratch0 uint64 + + // stackTop is the top of the stack. + stackTop uint64 + + // cpuSelf is back reference to CPU. + cpuSelf *CPU + + // kernelCR3 is the cr3 used for sentry kernel. + kernelCR3 uintptr + + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { // errorCode is the error code from the last exception. errorCode uintptr @@ -97,11 +123,7 @@ type CPUArchState struct { // exception. errorType uintptr - // gdt is the CPU's descriptor table. - gdt descriptorTable - - // tss is the CPU's task state. - tss TaskState64 + *kernelEntry } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index 0e2ab716c..508236e46 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -77,6 +77,9 @@ type CPUArchState struct { // lazyVFP is the value of cpacr_el1. lazyVFP uintptr + + // appASID is the asid value of guest application. + appASID uintptr } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 7fa43c2f5..d87b1fd00 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -36,12 +36,15 @@ func sysenter() // This must be called prior to sysret/iret. func swapgs() +// jumpToKernel jumps to the kernel version of the current RIP. +func jumpToKernel() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. -func sysret(*CPU, *arch.Registers) Vector +func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // "iret is the cadillac of CPL switching." // @@ -50,7 +53,7 @@ func sysret(*CPU, *arch.Registers) Vector // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. -func iret(*CPU, *arch.Registers) Vector +func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // exception is the generic exception entry. // diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 02df38331..f59747df3 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -63,6 +63,15 @@ MOVQ offset+PTRACE_RSI(reg), SI; \ MOVQ offset+PTRACE_RDI(reg), DI; +// WRITE_CR3() writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +#define WRITE_CR3() \ + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + // SWAP_GS swaps the kernel GS (CPU). #define SWAP_GS() \ BYTE $0x0F; BYTE $0x01; BYTE $0xf8; @@ -75,15 +84,9 @@ #define SYSRET64() \ BYTE $0x48; BYTE $0x0f; BYTE $0x07; -// LOAD_KERNEL_ADDRESS loads a kernel address. -#define LOAD_KERNEL_ADDRESS(from, to) \ - MOVQ from, to; \ - ORQ ·KernelStartAddress(SB), to; - // LOAD_KERNEL_STACK loads the kernel stack. -#define LOAD_KERNEL_STACK(from) \ - LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \ - LEAQ CPU_STACK_TOP(SP), SP; +#define LOAD_KERNEL_STACK(entry) \ + MOVQ ENTRY_STACK_TOP(entry), SP; // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 @@ -95,58 +98,93 @@ TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() RET +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + ORQ ·KernelStartAddress(SB), AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + // save SP AX userCR3 on the kernel stack. + MOVQ CPU_ENTRY(BX), BX + LOAD_KERNEL_STACK(BX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_RAX(AX) + PUSHQ CX + // Restore user register state. REGISTERS_LOAD(AX, 0) MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. - MOVQ PTRACE_RSP(AX), SP // Restore the stack directly. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + + // restore userCR3, AX, SP. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. + POPQ SP // Restore SP. SYSRET64() // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // Build an IRET frame & restore state. + MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) - MOVQ PTRACE_SS(AX), BX; PUSHQ BX - MOVQ PTRACE_RSP(AX), CX; PUSHQ CX - MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX - MOVQ PTRACE_CS(AX), DI; PUSHQ DI - MOVQ PTRACE_RIP(AX), SI; PUSHQ SI - REGISTERS_LOAD(AX, 0) // Restore most registers. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + PUSHQ PTRACE_SS(AX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_FLAGS(AX) + PUSHQ PTRACE_CS(AX) + PUSHQ PTRACE_RIP(AX) + PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack. + PUSHQ CX // Save userCR3 on kernel stack. + REGISTERS_LOAD(AX, 0) // Restore most registers. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. IRET() // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT,$0 // See iret, above. - MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX - MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX - MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI - MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI - REGISTERS_LOAD(GS, CPU_REGISTERS) - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + PUSHQ CPU_REGISTERS+PTRACE_SS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RSP(AX) + PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX) + PUSHQ CPU_REGISTERS+PTRACE_CS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RIP(AX) + REGISTERS_LOAD(AX, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX IRET() // See entry_amd64.go. TEXT ·Start(SB),NOSPLIT,$0 - LOAD_KERNEL_STACK(AX) // Set the stack. PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. PUSHQ AX // First argument (CPU). @@ -155,53 +193,60 @@ TEXT ·Start(SB),NOSPLIT,$0 // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 - // Interrupts are always disabled while we're executing in kernel mode - // and always enabled while executing in user mode. Therefore, we can - // reliably look at the flags in R11 to determine where this syscall - // was from. - TESTL $_RFLAGS_IF, R11 + // _RFLAGS_IOPL0 is always set in the user mode and it is never set in + // the kernel mode. See the comment of UserFlagsSet for more details. + TESTL $_RFLAGS_IOPL0, R11 JZ kernel - user: SWAP_GS() - XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. - MOVQ BX, PTRACE_RAX(AX) // Save everything else. - MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ CX, PTRACE_RIP(AX) MOVQ R11, PTRACE_FLAGS(AX) - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ SP, PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. + MOVQ CX, PTRACE_RAX(AX) // Save everything else. + MOVQ CX, PTRACE_ORIGRAX(AX) + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. // Return to the kernel, where the frame is: // - // vector (sp+24) + // vector (sp+32) + // userCR3 (sp+24) // regs (sp+16) // cpu (sp+8) // vcpu.Switch (sp+0) // - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ $Syscall, 24(SP) // Output vector. + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. + MOVQ $Syscall, 32(SP) // Output vector. RET kernel: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ AX, ENTRY_SCRATCH0(GS) + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), BX + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ AX // First argument (vCPU). CALL ·kernelSyscall(SB) // Call the trampoline. POPQ AX // Pop vCPU. @@ -230,16 +275,21 @@ TEXT ·exception(SB),NOSPLIT,$0 // ERROR_CODE (sp+8) // VECTOR (sp+0) // - TESTL $_RFLAGS_IF, 32(SP) + TESTL $_RFLAGS_IOPL0, 32(SP) JZ kernel user: SWAP_GS() ADDQ $-8, SP // Adjust for flags. MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs. + PUSHQ AX // Save user AX on stack. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX. + POPQ BX // Restore original AX. MOVQ BX, PTRACE_RAX(AX) // Save it. MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) @@ -249,34 +299,36 @@ user: MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) // Copy out and return. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. MOVQ 8(SP), CX // Load error code. - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. - MOVQ BX, 24(SP) // Output vector. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version). + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(AX) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. + MOVQ BX, 32(SP) // Output vector. RET kernel: // As per above, we can save directly. - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + PUSHQ AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + POPQ BX + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX) // Set the error code and adjust the stack. - MOVQ 8(SP), AX // Load the error code. - MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 8(SP), BX // Load the error code. + MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. MOVQ 0(SP), BX // BX contains the vector. - ADDQ $48, SP // Drop the exception frame. // Call the exception trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ BX // Second argument (vector). PUSHQ AX // First argument (vCPU). CALL ·kernelException(SB) // Call the trampoline. diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s index db6465663..1079a024b 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -27,7 +27,9 @@ // ERET returns using the ELR and SPSR for the current exception level. #define ERET() \ - WORD $0xd69f03e0 + WORD $0xd69f03e0; \ + DSB $7; \ + ISB $15; // RSV_REG is a register that holds el1 information temporarily. #define RSV_REG R18_PLATFORM @@ -40,6 +42,22 @@ #define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) +// sctlr_el1: system control register el1. +#define SCTLR_M 1 << 0 +#define SCTLR_C 1 << 2 +#define SCTLR_I 1 << 12 +#define SCTLR_DZE 1 << 14 +#define SCTLR_UCT 1 << 15 +#define SCTLR_UCI 1 << 26 + +#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT | SCTLR_UCI | SCTLR_DZE) + +// cntkctl_el1: counter-timer kernel control register el1. +#define CNTKCTL_EL0PCTEN 1 << 0 +#define CNTKCTL_EL0VCTEN 1 << 1 + +#define CNTKCTL_EL1_DEFAULT (CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN) + // Saves a register set. // // This is a macro because it may need to executed in contents where a stack is @@ -280,29 +298,27 @@ LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ MOVD RSV_REG, RSP; \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - ISB $15; \ - DSB $15; + WORD $0xd538d092; //MRS TPIDR_EL1, R18 // SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. #define SWITCH_TO_APP_PAGETABLE(from) \ - MOVD CPU_TTBR0_APP(from), RSV_REG; \ - WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + MRS TTBR1_EL1, R0; \ + MOVD CPU_APP_ASID(from), R1; \ + BFI $48, R1, $16, R0; \ + MSR R0, TTBR1_EL1; \ // set the ASID in TTBR1_EL1 (since TCR.A1 is set) ISB $15; \ - DSB $15; + MOVD CPU_TTBR0_APP(from), RSV_REG; \ + MSR RSV_REG, TTBR0_EL1; // SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. #define SWITCH_TO_KVM_PAGETABLE(from) \ - MOVD CPU_TTBR0_KVM(from), RSV_REG; \ - WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + MRS TTBR1_EL1, R0; \ + MOVD $1, R1; \ + BFI $48, R1, $16, R0; \ + MSR R0, TTBR1_EL1; \ ISB $15; \ - DSB $15; - -#define IRQ_ENABLE \ - MSR $2, DAIFSet; - -#define IRQ_DISABLE \ - MSR $2, DAIFClr; + MOVD CPU_TTBR0_KVM(from), RSV_REG; \ + MSR RSV_REG, TTBR0_EL1; #define VFP_ENABLE \ MOVD $FPEN_ENABLE, R0; \ @@ -318,29 +334,30 @@ #define KERNEL_ENTRY_FROM_EL0 \ SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable. - SWITCH_TO_KVM_PAGETABLE(RSV_REG); \ - WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 - MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer. - REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context. + WORD $0xd538d092; \ // MRS TPIDR_EL1, R18 + MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step2, load app context pointer. + REGISTERS_SAVE(RSV_REG_APP, 0); \ // step3, save app context. MOVD RSV_REG_APP, R20; \ LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \ ADD $16, RSP, RSP; \ MOVD RSV_REG, PTRACE_R18(R20); \ MOVD RSV_REG_APP, PTRACE_R9(R20); \ - MOVD R20, RSV_REG_APP; \ + MRS TPIDR_EL0, R3; \ + MOVD R3, PTRACE_TLS(R20); \ WORD $0xd5384003; \ // MRS SPSR_EL1, R3 - MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \ + MOVD R3, PTRACE_PSTATE(R20); \ MRS ELR_EL1, R3; \ - MOVD R3, PTRACE_PC(RSV_REG_APP); \ + MOVD R3, PTRACE_PC(R20); \ WORD $0xd5384103; \ // MRS SP_EL0, R3 - MOVD R3, PTRACE_SP(RSV_REG_APP); + MOVD R3, PTRACE_SP(R20); // KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1. #define KERNEL_ENTRY_FROM_EL1 \ WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context. MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \ + MRS TPIDR_EL0, R4; \ + MOVD R4, CPU_REGISTERS+PTRACE_TLS(RSV_REG); \ WORD $0xd5384004; \ // MRS SPSR_EL1, R4 MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ MRS ELR_EL1, R4; \ @@ -349,6 +366,26 @@ MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack. +// EXCEPTION_WITH_ERROR is a common exception handler function. +#define EXCEPTION_WITH_ERROR(user, vector) \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + WORD $0xd538601a; \ //MRS FAR_EL1, R26 + MOVD R26, CPU_FAULT_ADDR(RSV_REG); \ + MOVD $user, R3; \ + MOVD R3, CPU_ERROR_TYPE(RSV_REG); \ // Set error type to user. + MOVD $vector, R3; \ + MOVD R3, CPU_VECTOR_CODE(RSV_REG); \ + MRS ESR_EL1, R3; \ + MOVD R3, CPU_ERROR_CODE(RSV_REG); \ + B ·kernelExitToEl1(SB); + +// storeAppASID writes the application's asid value. +TEXT ·storeAppASID(SB),NOSPLIT,$0-8 + MOVD asid+0(FP), R1 + MRS TPIDR_EL1, RSV_REG + MOVD R1, CPU_APP_ASID(RSV_REG) + RET + // Halt halts execution. TEXT ·Halt(SB),NOSPLIT,$0 // Clear bluepill. @@ -356,15 +393,24 @@ TEXT ·Halt(SB),NOSPLIT,$0 CMP RSV_REG, R9 BNE mmio_exit MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG) + mmio_exit: // Disable fpsimd. WORD $0xd5381041 // MRS CPACR_EL1, R1 MOVD R1, CPU_LAZY_VFP(RSV_REG) VFP_DISABLE - // MMIO_EXIT. - MOVD $0, R9 - MOVD R0, 0xffff000000001000(R9) + // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT. + // + // To keep it simple, I used the address of exception table as the + // MMIO base address, so that I can trigger a MMIO-EXIT by forcibly writing + // a read-only space. + // Also, the length is engough to match a sufficient number of hypercall ID. + // Then, in host user space, I can calculate this address to find out + // which hypercall. + MRS VBAR_EL1, R9 + MOVD R0, 0x0(R9) + RET // HaltAndResume halts execution and point the pointer to the resume function. @@ -392,14 +438,17 @@ TEXT ·Current(SB),NOSPLIT,$0-8 MOVD R8, ret+0(FP) RET -#define STACK_FRAME_SIZE 16 +#define STACK_FRAME_SIZE 32 // kernelExitToEl0 is the entrypoint for application in guest_el0. // Prepare the vcpu environment for container application. TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 // Step1, save sentry context into memory. + MRS TPIDR_EL1, RSV_REG REGISTERS_SAVE(RSV_REG, CPU_REGISTERS) MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG) + MRS TPIDR_EL0, R3 + MOVD R3, CPU_REGISTERS+PTRACE_TLS(RSV_REG) WORD $0xd5384003 // MRS SPSR_EL1, R3 MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG) @@ -409,34 +458,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3 - // Step2, save SP_EL1, PSTATE into kernel temporary stack. - // switch to temporary stack. + // Step2, switch to temporary stack. LOAD_KERNEL_STACK(RSV_REG) - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - - SUB $STACK_FRAME_SIZE, RSP, RSP - MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11 - MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12 - STP (R11, R12), 16*0(RSP) - - MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11 - MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12 - - // Step3, test user pagetable. - // If user pagetable is empty, trapped in el1_ia. - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - SWITCH_TO_APP_PAGETABLE(RSV_REG) - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - SWITCH_TO_KVM_PAGETABLE(RSV_REG) - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - - // If pagetable is not empty, recovery kernel temporary stack. - ADD $STACK_FRAME_SIZE, RSP, RSP - // Step4, load app context pointer. + // Step3, load app context pointer. MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP - // Step5, prepare the environment for container application. + // Step4, prepare the environment for container application. // set sp_el0. MOVD PTRACE_SP(RSV_REG_APP), R1 WORD $0xd5184101 //MSR R1, SP_EL0 @@ -447,8 +475,18 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 MOVD PTRACE_PSTATE(RSV_REG_APP), R1 WORD $0xd5184001 //MSR R1, SPSR_EL1 + // need use kernel space address to excute below code, since + // after SWITCH_TO_APP_PAGETABLE the ASID is changed to app's + // ASID. + WORD $0x10000061 // ADR R1, do_exit_to_el0 + ORR $0xffff000000000000, R1, R1 + JMP (R1) + +do_exit_to_el0: // RSV_REG & RSV_REG_APP will be loaded at the end. REGISTERS_LOAD(RSV_REG_APP, 0) + MOVD PTRACE_TLS(RSV_REG_APP), RSV_REG + MSR RSV_REG, TPIDR_EL0 // switch to user pagetable. MOVD PTRACE_R18(RSV_REG_APP), RSV_REG @@ -456,11 +494,13 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 SUB $STACK_FRAME_SIZE, RSP, RSP STP (RSV_REG, RSV_REG_APP), 16*0(RSP) + STP (R0, R1), 16*1(RSP) WORD $0xd538d092 //MRS TPIDR_EL1, R18 SWITCH_TO_APP_PAGETABLE(RSV_REG) + LDP 16*1(RSP), (R0, R1) LDP 16*0(RSP), (RSV_REG, RSV_REG_APP) ADD $STACK_FRAME_SIZE, RSP, RSP @@ -470,7 +510,6 @@ TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 // Prepare the vcpu environment for sentry. TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 WORD $0xd538d092 //MRS TPIDR_EL1, R18 - MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1 WORD $0xd5184001 //MSR R1, SPSR_EL1 @@ -480,6 +519,9 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1 MOVD R1, RSP + SWITCH_TO_KVM_PAGETABLE(RSV_REG) + MRS TPIDR_EL1, RSV_REG + REGISTERS_LOAD(RSV_REG, CPU_REGISTERS) MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP @@ -487,7 +529,13 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 // Start is the CPU entrypoint. TEXT ·Start(SB),NOSPLIT,$0 - IRQ_DISABLE + // Init. + MOVD $SCTLR_EL1_DEFAULT, R1 + MSR R1, SCTLR_EL1 + + MOVD $CNTKCTL_EL1_DEFAULT, R1 + MSR R1, CNTKCTL_EL1 + MOVD R8, RSV_REG ORR $0xffff000000000000, RSV_REG, RSV_REG WORD $0xd518d092 //MSR R18, TPIDR_EL1 @@ -536,6 +584,7 @@ TEXT ·El1_sync(SB),NOSPLIT,$0 B el1_invalid el1_da: +el1_ia: WORD $0xd538d092 //MRS TPIDR_EL1, R18 WORD $0xd538601a //MRS FAR_EL1, R26 @@ -548,9 +597,6 @@ el1_da: B ·HaltAndResume(SB) -el1_ia: - B ·HaltAndResume(SB) - el1_sp_pc: B ·Shutdown(SB) @@ -622,24 +668,11 @@ el0_svc: MOVD $Syscall, R3 MOVD R3, CPU_VECTOR_CODE(RSV_REG) - B ·HaltAndResume(SB) + B ·kernelExitToEl1(SB) el0_da: - WORD $0xd538d092 //MRS TPIDR_EL1, R18 - WORD $0xd538601a //MRS FAR_EL1, R26 - - MOVD R26, CPU_FAULT_ADDR(RSV_REG) - - MOVD $1, R3 - MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. - - MOVD $PageFault, R3 - MOVD R3, CPU_VECTOR_CODE(RSV_REG) - - B ·HaltAndResume(SB) - el0_ia: - B ·Shutdown(SB) + EXCEPTION_WITH_ERROR(1, PageFault) el0_fpsimd_acc: B ·Shutdown(SB) @@ -654,7 +687,7 @@ el0_sp_pc: B ·Shutdown(SB) el0_undef: - B ·Shutdown(SB) + EXCEPTION_WITH_ERROR(1, El0Sync_undef) el0_dbg: B ·Shutdown(SB) diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 549f3d228..9742308d8 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -24,7 +24,10 @@ go_binary( "defs_impl_arm64.go", "main.go", ], - visibility = ["//pkg/sentry/platform/ring0:__pkg__"], + visibility = [ + "//pkg/sentry/platform/kvm:__pkg__", + "//pkg/sentry/platform/ring0:__pkg__", + ], deps = [ "//pkg/cpuid", "//pkg/sentry/arch", diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 900c0bba7..264be23d3 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -19,8 +19,8 @@ package ring0 // N.B. that constraints on KernelOpts must be satisfied. // //go:nosplit -func (k *Kernel) Init(opts KernelOpts) { - k.init(opts) +func (k *Kernel) Init(opts KernelOpts, maxCPUs int) { + k.init(opts, maxCPUs) } // Halt halts execution. @@ -31,31 +31,57 @@ type defaultHooks struct{} // KernelSyscall implements Hooks.KernelSyscall. // +// +checkescape:all +// //go:nosplit -func (defaultHooks) KernelSyscall() { Halt() } +func (defaultHooks) KernelSyscall() { + Halt() +} // KernelException implements Hooks.KernelException. // +// +checkescape:all +// //go:nosplit -func (defaultHooks) KernelException(Vector) { Halt() } +func (defaultHooks) KernelException(Vector) { + Halt() +} // kernelSyscall is a trampoline. // +// When in amd64, it is called with %rip on the upper half, so it can +// NOT access to any global data which is not mapped on upper and must +// call to function pointers or interfaces to switch to the lower half +// so that callee can access to global data. +// +// +checkescape:hard,stack +// //go:nosplit -func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } +func kernelSyscall(c *CPU) { + c.hooks.KernelSyscall() +} // kernelException is a trampoline. // +// When in amd64, it is called with %rip on the upper half, so it can +// NOT access to any global data which is not mapped on upper and must +// call to function pointers or interfaces to switch to the lower half +// so that callee can access to global data. +// +// +checkescape:hard,stack +// //go:nosplit -func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } +func kernelException(c *CPU, vector Vector) { + c.hooks.KernelException(vector) +} // Init initializes a new CPU. // // Init allows embedding in other objects. -func (c *CPU) Init(k *Kernel, hooks Hooks) { - c.self = c // Set self reference. - c.kernel = k // Set kernel reference. - c.init() // Perform architectural init. +func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { + c.self = c // Set self reference. + c.kernel = k // Set kernel reference. + c.init(cpuID) // Perform architectural init. // Require hooks. if hooks != nil { diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 0feff8778..3a9dff4cc 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -18,13 +18,42 @@ package ring0 import ( "encoding/binary" + "reflect" + + "gvisor.dev/gvisor/pkg/usermem" ) // init initializes architecture-specific state. -func (k *Kernel) init(opts KernelOpts) { +func (k *Kernel) init(opts KernelOpts, maxCPUs int) { // Save the root page tables. k.PageTables = opts.PageTables + entrySize := reflect.TypeOf(kernelEntry{}).Size() + var ( + entries []kernelEntry + padding = 1 + ) + for { + entries = make([]kernelEntry, maxCPUs+padding-1) + totalSize := entrySize * uintptr(maxCPUs+padding-1) + addr := reflect.ValueOf(&entries[0]).Pointer() + if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize { + // The runtime forces power-of-2 alignment for allocations, and we are therefore + // safe once the first address is aligned and the chunk is at least a full page. + break + } + padding = padding << 1 + } + k.cpuEntries = entries + + k.globalIDT = &idt64{} + if reflect.TypeOf(idt64{}).Size() != usermem.PageSize { + panic("Size of globalIDT should be PageSize") + } + if reflect.ValueOf(k.globalIDT).Pointer()&(usermem.PageSize-1) != 0 { + panic("Allocated globalIDT should be page aligned") + } + // Setup the IDT, which is uniform. for v, handler := range handlers { // Allow Breakpoint and Overflow to be called from all @@ -39,8 +68,26 @@ func (k *Kernel) init(opts KernelOpts) { } } +func (k *Kernel) EntryRegions() map[uintptr]uintptr { + regions := make(map[uintptr]uintptr) + + addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer() + size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries)) + end, _ := usermem.Addr(addr + size).RoundUp() + regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) + + addr = reflect.ValueOf(k.globalIDT).Pointer() + size = reflect.TypeOf(idt64{}).Size() + end, _ = usermem.Addr(addr + size).RoundUp() + regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) + + return regions +} + // init initializes architecture-specific state. -func (c *CPU) init() { +func (c *CPU) init(cpuID int) { + c.kernelEntry = &c.kernel.cpuEntries[cpuID] + c.cpuSelf = c // Null segment. c.gdt[0].setNull() @@ -65,6 +112,7 @@ func (c *CPU) init() { // Set the kernel stack pointer in the TSS (virtual address). stackAddr := c.StackTop() + c.stackTop = stackAddr c.tss.rsp0Lo = uint32(stackAddr) c.tss.rsp0Hi = uint32(stackAddr >> 32) c.tss.ist1Lo = uint32(stackAddr) @@ -178,10 +226,12 @@ func IsCanonical(addr uint64) bool { // // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. // +// +checkescape:all +// //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) - kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) + c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) // Sanitize registers. regs := switchOpts.Registers @@ -192,20 +242,16 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // Set application FS. - WriteGS(uintptr(regs.Gs_base)) // Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point. - jumpToKernel() // Switch to upper half. - writeCR3(uintptr(userCR3)) // Change to user address space. + WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. + WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. + LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. if switchOpts.FullRestore { - vector = iret(c, regs) + vector = iret(c, regs, uintptr(userCR3)) } else { - vector = sysret(c, regs) + vector = sysret(c, regs, uintptr(userCR3)) } - writeCR3(uintptr(kernelCR3)) // Return to kernel address space. - jumpToUser() // Return to lower half. - SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS. + SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. + WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. return } @@ -217,7 +263,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { //go:nosplit func start(c *CPU) { // Save per-cpu & FS segment. - WriteGS(kernelAddr(c)) + WriteGS(kernelAddr(c.kernelEntry)) WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index ccacaea6b..b294ccc7c 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -25,13 +25,13 @@ func HaltAndResume() func HaltEl1SvcAndResume() // init initializes architecture-specific state. -func (k *Kernel) init(opts KernelOpts) { +func (k *Kernel) init(opts KernelOpts, maxCPUs int) { // Save the root page tables. k.PageTables = opts.PageTables } // init initializes architecture-specific state. -func (c *CPU) init() { +func (c *CPU) init(cpuID int) { // Set the kernel stack pointer(virtual address). c.registers.Sp = uint64(c.StackTop()) @@ -53,14 +53,23 @@ func IsCanonical(addr uint64) bool { //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { - // Sanitize registers. + storeAppASID(uintptr(switchOpts.UserASID)) + if switchOpts.Flush { + FlushTlbAll() + } + regs := switchOpts.Registers - regs.Pstate &= ^uint64(UserFlagsClear) + regs.Pstate &= ^uint64(PsrFlagsClear) regs.Pstate |= UserFlagsSet + + LoadFloatingPoint(switchOpts.FloatingPointState) + kernelExitToEl0() + + SaveFloatingPoint(switchOpts.FloatingPointState) + vector = c.vecCode - // Perform the switch. return } diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index ca968a036..0ec5c3bc5 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -61,21 +61,9 @@ func wrgsbase(addr uintptr) // wrgsmsr writes to the GS_BASE MSR. func wrgsmsr(addr uintptr) -// writeCR3 writes the CR3 value. -func writeCR3(phys uintptr) - -// readCR3 reads the current CR3 value. -func readCR3() uintptr - // readCR2 reads the current CR2 value. func readCR2() uintptr -// jumpToKernel jumps to the kernel version of the current RIP. -func jumpToKernel() - -// jumpToUser jumps to the user version of the current RIP. -func jumpToUser() - // fninit initializes the floating point unit. func fninit() diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 75d742750..2fe83568a 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -127,53 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 BYTE $0x0f; BYTE $0x30; // WRMSR RET -// jumpToUser changes execution to the user address. -// -// This works by changing the return value to the user version. -TEXT ·jumpToUser(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - NOTQ BX - ANDQ BX, SP // Switch the stack. - ANDQ BX, BP // Switch the frame pointer. - ANDQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - -// jumpToKernel changes execution to the kernel address space. -// -// This works by changing the return value to the kernel version. -TEXT ·jumpToKernel(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - ORQ BX, SP // Switch the stack. - ORQ BX, BP // Switch the frame pointer. - ORQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - -// writeCR3 writes the given CR3 value. -// -// The code corresponds to: -// -// mov %rax, %cr3 -// -TEXT ·writeCR3(SB),NOSPLIT,$0-8 - MOVQ cr3+0(FP), AX - BYTE $0x0f; BYTE $0x22; BYTE $0xd8; - RET - -// readCR3 reads the current CR3 value. -// -// The code corresponds to: -// -// mov %cr3, %rax -// -TEXT ·readCR3(SB),NOSPLIT,$0-8 - BYTE $0x0f; BYTE $0x20; BYTE $0xd8; - MOVQ AX, ret+0(FP) - RET - // readCR2 reads the current CR2 value. // // The code corresponds to: diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index a6345010d..d91a09de1 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -16,6 +16,15 @@ package ring0 +// storeAppASID writes the application's asid value. +func storeAppASID(asid uintptr) + +// LocalFlushTlbAll same as FlushTlbAll, but only applies to the calling CPU. +func LocalFlushTlbAll() + +// FlushTlbAll flush all tlb. +func FlushTlbAll() + // CPACREL1 returns the value of the CPACR_EL1 register. func CPACREL1() (value uintptr) @@ -38,11 +47,11 @@ func SaveVRegs(*byte) // LoadVRegs loads V0-V31 registers. func LoadVRegs(*byte) -// GetTLS returns the value of TPIDR_EL0 register. -func GetTLS() (value uint64) +// LoadFloatingPoint loads floating point state. +func LoadFloatingPoint(*byte) -// SetTLS writes the TPIDR_EL0 value. -func SetTLS(value uint64) +// SaveFloatingPoint saves floating point state. +func SaveFloatingPoint(*byte) // Init sets function pointers based on architectural features. // diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index b63e14b41..da9d3cf55 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -15,14 +15,18 @@ #include "funcdata.h" #include "textflag.h" -TEXT ·GetTLS(SB),NOSPLIT,$0-8 - MRS TPIDR_EL0, R1 - MOVD R1, ret+0(FP) +TEXT ·LocalFlushTlbAll(SB),NOSPLIT,$0 + DSB $6 // dsb(nshst) + WORD $0xd508871f // __tlbi(vmalle1) + DSB $7 // dsb(nsh) + ISB $15 RET -TEXT ·SetTLS(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - MSR R1, TPIDR_EL0 +TEXT ·FlushTlbAll(SB),NOSPLIT,$0 + DSB $10 // dsb(ishst) + WORD $0xd508831f // __tlbi(vmalle1is) + DSB $11 // dsb(ish) + ISB $15 RET TEXT ·CPACREL1(SB),NOSPLIT,$0-8 @@ -129,3 +133,89 @@ TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 ISB $15 RET + +TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD 0(R0), R1 + MOVD R1, FPSR + MOVD 8(R0), R1 + MOVD R1, NZCV + + FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + + RET + +TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD FPSR, R1 + MOVD R1, 0(R0) + MOVD NZCV, R1 + MOVD R1, 8(R0) + + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + + RET diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index b8ab120a0..ca4075b09 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -30,14 +30,21 @@ func Emit(w io.Writer) { c := &CPU{} fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + + e := &kernelEntry{} + fmt.Fprintf(w, "\n// CPU entry offsets.\n") + fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) fmt.Fprintf(w, "\n// Vectors.\n") diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go index f3de962f0..45eba960d 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -41,6 +41,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_APP_ASID 0x%02x\n", reflect.ValueOf(&c.appASID).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) @@ -124,4 +125,5 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_TLS 0x%02x\n", reflect.ValueOf(&p.TPIDR_EL0).Pointer()-reflect.ValueOf(p).Pointer()) } diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index 23fd5c352..8d75b7599 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -53,9 +53,14 @@ type RuntimeAllocator struct { // NewRuntimeAllocator returns an allocator that uses runtime allocation. func NewRuntimeAllocator() *RuntimeAllocator { - return &RuntimeAllocator{ - used: make(map[*PTEs]struct{}), - } + r := new(RuntimeAllocator) + r.Init() + return r +} + +// Init initializes a RuntimeAllocator. +func (r *RuntimeAllocator) Init() { + r.used = make(map[*PTEs]struct{}) } // Recycle returns freed pages to the pool. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 87e88e97d..7f18ac296 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -86,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true } // // Precondition: addr & length must be page-aligned, their sum must not overflow. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { if !opts.AccessType.Any() { @@ -128,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { // // Precondition: addr & length must be page-aligned. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { w := unmapWalker{ @@ -162,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { // // Precondition: addr & length must be page-aligned. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { w := emptyWalker{ @@ -197,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false } // Lookup returns the physical address for the given virtual address. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go index 78510ebed..520161755 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -72,13 +72,14 @@ const ( ) const ( - mtNormal = 0x4 << 2 + mtDevicenGnRE = 0x1 << 2 + mtNormal = 0x4 << 2 ) const ( executeDisable = xn - optionMask = 0xfff | 0xfff<<48 - protDefault = accessed | shared | mtNormal + optionMask = 0xfff | 0xffff<<48 + protDefault = accessed | shared ) // MapOpts are x86 options. @@ -184,8 +185,10 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) { if opts.User { v |= user + v |= mtNormal } else { v = v &^ user + v |= mtNormal } atomic.StoreUintptr((*uintptr)(p), v) } @@ -200,7 +203,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { // This should never happen. panic("unaligned physical address!") } - v := addr | typeTable | protDefault + v := addr | typeTable | protDefault | mtNormal atomic.StoreUintptr((*uintptr)(p), v) } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go index 1a49f12a2..5ddd10256 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go @@ -36,7 +36,7 @@ const ( pudSize = 1 << pudShift pgdSize = 1 << pgdShift - ttbrASIDOffset = 55 + ttbrASIDOffset = 48 ttbrASIDMask = 0xff entriesPerPage = 512 diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 9da0ea685..34fbc1c35 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -39,7 +39,9 @@ const ( _RFLAGS_AC = 1 << 18 _RFLAGS_NT = 1 << 14 - _RFLAGS_IOPL = 3 << 12 + _RFLAGS_IOPL0 = 1 << 12 + _RFLAGS_IOPL1 = 1 << 13 + _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 _RFLAGS_DF = 1 << 10 _RFLAGS_IF = 1 << 9 _RFLAGS_STEP = 1 << 8 @@ -67,15 +69,45 @@ const ( KernelFlagsSet = _RFLAGS_RESERVED // UserFlagsSet are always set in userspace. - UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + // + // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege + // level. The Current Privilege Level (CPL) of the task must be less + // than or equal to the IOPL in order for the task or program to access + // I/O ports. + // + // Here, _RFLAGS_IOPL0 is used only to determine whether the task is + // running in the kernel or userspace mode. In the user mode, the CPL is + // always 3 and it doesn't matter what IOPL is set if it is bellow CPL. + // + // We need to have one bit which will be always different in user and + // kernel modes. And we have to remember that even though we have + // KernelFlagsClear, we still can see some of these flags in the kernel + // mode. This can happen when the goruntime switches on a goroutine + // which has been saved in the host mode. On restore, the popf + // instruction is used to restore flags and this means that all flags + // what the goroutine has in the host mode will be restored in the + // kernel mode. + // + // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set + // it in the user mode. So if this flag is set, the task is running in + // the user mode and if it isn't set, the task is running in the kernel + // mode. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 // KernelFlagsClear should always be clear in the kernel. KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT // UserFlagsClear are always cleared in userspace. - UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 ) +// IsKernelFlags returns true if rflags coresponds to the kernel mode. +// +// go:nosplit +func IsKernelFlags(rflags uint64) bool { + return rflags&_RFLAGS_IOPL0 == 0 +} + // Vector is an exception vector. type Vector uintptr @@ -104,7 +136,7 @@ const ( VirtualizationException SecurityException = 0x1e SyscallInt80 = 0x80 - _NR_INTERRUPTS = SyscallInt80 + 1 + _NR_INTERRUPTS = 0x100 ) // System call vectors. diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index c40c6d673..a3f775d15 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/context", + "//pkg/marshal", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8b439a078..70ccf77a7 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { for _, fd := range fds { file := t.GetFile(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFiles) Release() { +func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index fd08179be..37d02948f 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -24,6 +24,8 @@ import ( ) // SCMRightsVFS2 represents a SCM_RIGHTS socket control message. +// +// +stateify savable type SCMRightsVFS2 interface { transport.RightsControlMessage @@ -34,9 +36,11 @@ type SCMRightsVFS2 interface { Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool) } -// RightsFiles represents a SCM_RIGHTS socket control message. A reference is -// maintained for each vfs.FileDescription and is release either when an FD is created or -// when the Release method is called. +// RightsFilesVFS2 represents a SCM_RIGHTS socket control message. A reference +// is maintained for each vfs.FileDescription and is release either when an FD +// is created or when the Release method is called. +// +// +stateify savable type RightsFilesVFS2 []*vfs.FileDescription // NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message @@ -46,7 +50,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { for _, fd := range fds { file := t.GetFileVFS2(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -78,9 +82,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFilesVFS2) Release() { +func (fs *RightsFilesVFS2) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -93,7 +97,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index e82d6cd1e..b6ebe29d6 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -21,11 +21,14 @@ go_library( "//pkg/context", "//pkg/fdnotifier", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", @@ -36,6 +39,9 @@ go_library( "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c11e82c10..7d3c4a01c 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -24,6 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -98,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc return nil, syserr.FromError(err) } dirent := socket.NewDirent(ctx, socketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } @@ -267,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, @@ -279,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, @@ -319,12 +321,12 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } - // Whitelist options and constrain option length. + // Only allow known and safe options. optlen := getSockOptLen(t, level, name) switch level { case linux.SOL_IP: @@ -364,12 +366,13 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr if err != nil { return nil, syserr.FromError(err) } - return opt, nil + optP := primitive.ByteSlice(opt) + return &optP, nil } // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { - // Whitelist options and constrain option length. + // Only allow known and safe options. optlen := setSockOptLen(t, level, name) switch level { case linux.SOL_IP: @@ -415,7 +418,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] // RecvMsg implements socket.Socket.RecvMsg. func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { - // Whitelist flags. + // Only allow known and safe flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the @@ -537,7 +540,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements socket.Socket.SendMsg. func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { - // Whitelist flags. + // Only allow known and safe flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument } @@ -708,6 +711,6 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int func init() { for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) - socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{family}) } } diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 677743113..9a2cac40b 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -32,9 +33,11 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// +stateify savable type socketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // We store metadata for hostinet sockets internally. Technically, we should // access metadata (e.g. through stat, chmod) on the host for correctness, @@ -49,7 +52,8 @@ var _ = socket.SocketVFS2(&socketVFS2{}) func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - d := sockfs.NewDentry(t.Credentials(), mnt) + d := sockfs.NewDentry(t, mnt) + defer d.DecRef(t) s := &socketVFS2{ socketOpsCommon: socketOpsCommon{ @@ -59,6 +63,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in fd: fd, }, } + s.LockFD.Init(&vfs.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } @@ -68,11 +73,19 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in DenyPWrite: true, UseDentryMetadata: true, }); err != nil { + fdnotifier.RemoveFD(int32(s.fd)) return nil, syserr.FromError(err) } return vfsfd, nil } +// Release implements vfs.FileDescriptionImpl.Release. +func (s *socketVFS2) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.socketOpsCommon.Release(ctx) +} + // Readiness implements waiter.Waitable.Readiness. func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) @@ -93,7 +106,7 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// PRead implements vfs.FileDescriptionImpl. +// PRead implements vfs.FileDescriptionImpl.PRead. func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE } @@ -131,6 +144,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return int64(n), err } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + type socketProviderVFS2 struct { family int } diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index a48082631..7e7857ac3 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -30,6 +30,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -53,11 +56,14 @@ type Stack struct { interfaceAddrs map[int32][]inet.InterfaceAddr routes []inet.Route supportsIPv6 bool + tcpRecovery inet.TCPLossRecovery tcpRecvBufSize inet.TCPBufferSize tcpSendBufSize inet.TCPBufferSize tcpSACKEnabled bool netDevFile *os.File netSNMPFile *os.File + ipv4Forwarding bool + ipv6Forwarding bool } // NewStack returns an empty Stack containing no configuration. @@ -117,6 +123,13 @@ func (s *Stack) Configure() error { s.netSNMPFile = f } + s.ipv6Forwarding = false + if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil { + s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0" + } else { + log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false") + } + return nil } @@ -311,7 +324,12 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { } // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. -func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { +func (s *Stack) AddInterfaceAddr(int32, inet.InterfaceAddr) error { + return syserror.EACCES +} + +// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. +func (s *Stack) RemoveInterfaceAddr(int32, inet.InterfaceAddr) error { return syserror.EACCES } @@ -346,7 +364,17 @@ func (s *Stack) TCPSACKEnabled() (bool, error) { } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. -func (s *Stack) SetTCPSACKEnabled(enabled bool) error { +func (s *Stack) SetTCPSACKEnabled(bool) error { + return syserror.EACCES +} + +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + return s.tcpRecovery, nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(inet.TCPLossRecovery) error { return syserror.EACCES } @@ -407,18 +435,18 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { } if rawLine == "" { - return fmt.Errorf("Failed to get raw line") + return fmt.Errorf("failed to get raw line") } parts := strings.SplitN(rawLine, ":", 2) if len(parts) != 2 { - return fmt.Errorf("Failed to get prefix from: %q", rawLine) + return fmt.Errorf("failed to get prefix from: %q", rawLine) } sliceStat = toSlice(stat) fields := strings.Fields(strings.TrimSpace(parts[1])) if len(fields) != len(sliceStat) { - return fmt.Errorf("Failed to parse fields: %q", rawLine) + return fmt.Errorf("failed to parse fields: %q", rawLine) } if _, ok := stat.(*inet.StatSNMPTCP); ok { snmpTCP = true @@ -434,7 +462,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { sliceStat[i], err = strconv.ParseUint(fields[i], 10, 64) } if err != nil { - return fmt.Errorf("Failed to parse field %d from: %q, %v", i, rawLine, err) + return fmt.Errorf("failed to parse field %d from: %q, %v", i, rawLine, err) } } @@ -457,3 +485,21 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} + +// Forwarding implements inet.Stack.Forwarding. +func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { + switch protocol { + case ipv4.ProtocolNumber: + return s.ipv4Forwarding + case ipv6.ProtocolNumber: + return s.ipv6Forwarding + default: + log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol) + return false + } +} + +// SetForwarding implements inet.Stack.SetForwarding. +func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { + return syserror.EACCES +} diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 721094bbf..8aea0200f 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -6,6 +6,8 @@ go_library( name = "netfilter", srcs = [ "extensions.go", + "ipv4.go", + "ipv6.go", "netfilter.go", "owner_matcher.go", "targets.go", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index 0336a32d8..549787955 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,7 +39,7 @@ type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string - // marshal converts from an stack.Matcher to an ABI struct. + // marshal converts from a stack.Matcher to an ABI struct. marshal(matcher stack.Matcher) []byte // unmarshal converts from the ABI matcher struct to an @@ -93,3 +95,71 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf } return matchMaker.unmarshal(buf, filter) } + +// targetMaker knows how to (un)marshal a target. Once registered, +// marshalTarget and unmarshalTarget can be used. +type targetMaker interface { + // id uniquely identifies the target. + id() stack.TargetID + + // marshal converts from a stack.Target to an ABI struct. + marshal(target stack.Target) []byte + + // unmarshal converts from the ABI matcher struct to a stack.Target. + unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) +} + +// targetMakers maps the TargetID of supported targets to the targetMaker that +// marshals and unmarshals it. It is immutable after package initialization. +var targetMakers = map[stack.TargetID]targetMaker{} + +func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) { + tid := stack.TargetID{ + Name: name, + NetworkProtocol: netProto, + Revision: rev, + } + if _, ok := targetMakers[tid]; !ok { + return 0, false + } + + // Return the highest supported revision unless rev is higher. + for _, other := range targetMakers { + otherID := other.id() + if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev { + rev = uint8(otherID.Revision) + } + } + return rev, true +} + +// registerTargetMaker should be called by target extensions to register them +// with the netfilter package. +func registerTargetMaker(tm targetMaker) { + if _, ok := targetMakers[tm.id()]; ok { + panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id())) + } + targetMakers[tm.id()] = tm +} + +func marshalTarget(target stack.Target) []byte { + targetMaker, ok := targetMakers[target.ID()] + if !ok { + panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID())) + } + return targetMaker.marshal(target) +} + +func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) { + tid := stack.TargetID{ + Name: target.Name.String(), + NetworkProtocol: filter.NetworkProtocol(), + Revision: target.Revision, + } + targetMaker, ok := targetMakers[tid] + if !ok { + nflog("unsupported target with name %q", target.Name.String()) + return nil, syserr.ErrInvalidArgument + } + return targetMaker.unmarshal(buf, filter) +} diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go new file mode 100644 index 000000000..b560fae0d --- /dev/null +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -0,0 +1,265 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// emptyIPv4Filter is for comparison with a rule's filters to determine whether +// it is also empty. It is immutable. +var emptyIPv4Filter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00", +} + +// convertNetstackToBinary4 converts the iptables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a little data, reading some +// offsets, jumping to those offsets, parsing again, etc. +func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) { + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(tablename) { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + } + + table, ok := stk.IPTables().GetTable(tablename.String(), false) + if !ok { + return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) + } + + // Setup the info struct. + entries, info := getEntries4(table, tablename) + return entries, info, nil +} + +func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) { + var info linux.IPTGetinfo + var entries linux.KernelIPTGetEntries + copy(info.Name[:], tablename[:]) + copy(entries.Name[:], info.Name[:]) + info.ValidHooks = table.ValidHooks() + + for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + + setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) + // Each rule corresponds to an entry. + entry := linux.KernelIPTEntry{ + Entry: linux.IPTEntry{ + IP: linux.IPTIP{ + Protocol: uint16(rule.Filter.Protocol), + }, + NextOffset: linux.SizeOfIPTEntry, + TargetOffset: linux.SizeOfIPTEntry, + }, + } + copy(entry.Entry.IP.Dst[:], rule.Filter.Dst) + copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask) + copy(entry.Entry.IP.Src[:], rule.Filter.Src) + copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask) + copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP + } + if rule.Filter.SrcInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP + } + if rule.Filter.OutputInterfaceInvert { + entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + nflog("convert to binary: matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.Entry.NextOffset += uint16(len(serialized)) + entry.Entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. + serialized := marshalTarget(rule.Target) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.Entry.NextOffset += uint16(len(serialized)) + + nflog("convert to binary: adding entry: %+v", entry) + + entries.Size += uint32(entry.Entry.NextOffset) + entries.Entrytable = append(entries.Entrytable, entry) + info.NumEntries++ + } + + info.Size = entries.Size + nflog("convert to binary: finished with an marshalled size of %d", info.Size) + return entries, info +} + +func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { + nflog("set entries: setting entries in table %q", replace.Name.String()) + + // Convert input into a list of rules and their offsets. + var offset uint32 + // offsets maps rule byte offsets to their position in table.Rules. + offsets := map[uint32]int{} + for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { + nflog("set entries: processing entry at offset %d", offset) + + // Get the struct ipt_entry. + if len(optVal) < linux.SizeOfIPTEntry { + nflog("optVal has insufficient size for entry %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var entry linux.IPTEntry + buf := optVal[:linux.SizeOfIPTEntry] + binary.Unmarshal(buf, usermem.ByteOrder, &entry) + initialOptValLen := len(optVal) + optVal = optVal[linux.SizeOfIPTEntry:] + + if entry.TargetOffset < linux.SizeOfIPTEntry { + nflog("entry has too-small target offset %d", entry.TargetOffset) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): We should support more IPTIP + // filtering fields. + filter, err := filterFromIPTIP(entry.IP) + if err != nil { + nflog("bad iptip: %v", err) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Matchers and targets can specify + // that they only work for certain protocols, hooks, tables. + // Get matchers. + matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry + if len(optVal) < int(matchersSize) { + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + nflog("failed to parse matchers: %v", err) + return nil, syserr.ErrInvalidArgument + } + optVal = optVal[matchersSize:] + + // Get the target of the rule. + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + + rule := stack.Rule{ + Filter: filter, + Matchers: matchers, + } + + { + target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */) + if err != nil { + nflog("failed to parse target: %v", err) + return nil, err + } + rule.Target = target + } + optVal = optVal[targetSize:] + + table.Rules = append(table.Rules, rule) + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return nil, syserr.ErrInvalidArgument + } + } + return offsets, nil +} + +func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { + if containsUnsupportedFields4(iptip) { + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + // A Protocol value of 0 indicates all protocols match. + CheckProtocol: iptip.Protocol != 0, + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, + }, nil +} + +func containsUnsupportedFields4(iptip linux.IPTIP) bool { + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. + var emptyInterface = [linux.IFNAMSIZ]byte{} + // Disable any supported inverse flags. + inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || + iptip.InputInterfaceMask != emptyInterface || + iptip.Flags != 0 || + iptip.InverseFlags&^inverseMask != 0 +} diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go new file mode 100644 index 000000000..4253f7bf4 --- /dev/null +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -0,0 +1,270 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netfilter + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" +) + +// emptyIPv6Filter is for comparison with a rule's filters to determine whether +// it is also empty. It is immutable. +var emptyIPv6Filter = stack.IPHeaderFilter{ + Dst: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + Src: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", +} + +// convertNetstackToBinary6 converts the ip6tables as stored in netstack to the +// format expected by the iptables tool. Linux stores each table as a binary +// blob that can only be traversed by parsing a little data, reading some +// offsets, jumping to those offsets, parsing again, etc. +func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) { + // The table name has to fit in the struct. + if linux.XT_TABLE_MAXNAMELEN < len(tablename) { + return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) + } + + table, ok := stk.IPTables().GetTable(tablename.String(), true) + if !ok { + return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) + } + + // Setup the info struct, which is the same in IPv4 and IPv6. + entries, info := getEntries6(table, tablename) + return entries, info, nil +} + +func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) { + var info linux.IPTGetinfo + var entries linux.KernelIP6TGetEntries + copy(info.Name[:], tablename[:]) + copy(entries.Name[:], info.Name[:]) + info.ValidHooks = table.ValidHooks() + + for ruleIdx, rule := range table.Rules { + nflog("convert to binary: current offset: %d", entries.Size) + + setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) + // Each rule corresponds to an entry. + entry := linux.KernelIP6TEntry{ + Entry: linux.IP6TEntry{ + IPv6: linux.IP6TIP{ + Protocol: uint16(rule.Filter.Protocol), + }, + NextOffset: linux.SizeOfIP6TEntry, + TargetOffset: linux.SizeOfIP6TEntry, + }, + } + copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst) + copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask) + copy(entry.Entry.IPv6.Src[:], rule.Filter.Src) + copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask) + copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface) + copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) + if rule.Filter.DstInvert { + entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP + } + if rule.Filter.SrcInvert { + entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP + } + if rule.Filter.OutputInterfaceInvert { + entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT + } + if rule.Filter.CheckProtocol { + entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO + } + + for _, matcher := range rule.Matchers { + // Serialize the matcher and add it to the + // entry. + serialized := marshalMatcher(matcher) + nflog("convert to binary: matcher serialized as: %v", serialized) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.Entry.NextOffset += uint16(len(serialized)) + entry.Entry.TargetOffset += uint16(len(serialized)) + } + + // Serialize and append the target. + serialized := marshalTarget(rule.Target) + if len(serialized)%8 != 0 { + panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) + } + entry.Elems = append(entry.Elems, serialized...) + entry.Entry.NextOffset += uint16(len(serialized)) + + nflog("convert to binary: adding entry: %+v", entry) + + entries.Size += uint32(entry.Entry.NextOffset) + entries.Entrytable = append(entries.Entrytable, entry) + info.NumEntries++ + } + + info.Size = entries.Size + nflog("convert to binary: finished with an marshalled size of %d", info.Size) + return entries, info +} + +func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { + nflog("set entries: setting entries in table %q", replace.Name.String()) + + // Convert input into a list of rules and their offsets. + var offset uint32 + // offsets maps rule byte offsets to their position in table.Rules. + offsets := map[uint32]int{} + for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { + nflog("set entries: processing entry at offset %d", offset) + + // Get the struct ipt_entry. + if len(optVal) < linux.SizeOfIP6TEntry { + nflog("optVal has insufficient size for entry %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var entry linux.IP6TEntry + buf := optVal[:linux.SizeOfIP6TEntry] + binary.Unmarshal(buf, usermem.ByteOrder, &entry) + initialOptValLen := len(optVal) + optVal = optVal[linux.SizeOfIP6TEntry:] + + if entry.TargetOffset < linux.SizeOfIP6TEntry { + nflog("entry has too-small target offset %d", entry.TargetOffset) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): We should support more IPTIP + // filtering fields. + filter, err := filterFromIP6TIP(entry.IPv6) + if err != nil { + nflog("bad iptip: %v", err) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Matchers and targets can specify + // that they only work for certain protocols, hooks, tables. + // Get matchers. + matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry + if len(optVal) < int(matchersSize) { + nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + matchers, err := parseMatchers(filter, optVal[:matchersSize]) + if err != nil { + nflog("failed to parse matchers: %v", err) + return nil, syserr.ErrInvalidArgument + } + optVal = optVal[matchersSize:] + + // Get the target of the rule. + targetSize := entry.NextOffset - entry.TargetOffset + if len(optVal) < int(targetSize) { + nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + + rule := stack.Rule{ + Filter: filter, + Matchers: matchers, + } + + { + target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */) + if err != nil { + nflog("failed to parse target: %v", err) + return nil, err + } + rule.Target = target + } + optVal = optVal[targetSize:] + + table.Rules = append(table.Rules, rule) + offsets[offset] = int(entryIdx) + offset += uint32(entry.NextOffset) + + if initialOptValLen-len(optVal) != int(entry.NextOffset) { + nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) + return nil, syserr.ErrInvalidArgument + } + } + return offsets, nil +} + +func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { + if containsUnsupportedFields6(iptip) { + return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) + } + if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) + } + if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize { + return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) + } + + n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) + if n == -1 { + n = len(iptip.OutputInterface) + } + ifname := string(iptip.OutputInterface[:n]) + + n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) + if n == -1 { + n = len(iptip.OutputInterfaceMask) + } + ifnameMask := string(iptip.OutputInterfaceMask[:n]) + + return stack.IPHeaderFilter{ + Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), + // In ip6tables a flag controls whether to check the protocol. + CheckProtocol: iptip.Flags&linux.IP6T_F_PROTO != 0, + Dst: tcpip.Address(iptip.Dst[:]), + DstMask: tcpip.Address(iptip.DstMask[:]), + DstInvert: iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0, + Src: tcpip.Address(iptip.Src[:]), + SrcMask: tcpip.Address(iptip.SrcMask[:]), + SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, + OutputInterface: ifname, + OutputInterfaceMask: ifnameMask, + OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, + }, nil +} + +func containsUnsupportedFields6(iptip linux.IP6TIP) bool { + // The following features are supported: + // - Protocol + // - Dst and DstMask + // - Src and SrcMask + // - The inverse destination IP check flag + // - OutputInterface, OutputInterfaceMask and its inverse. + var emptyInterface = [linux.IFNAMSIZ]byte{} + flagMask := uint8(linux.IP6T_F_PROTO) + // Disable any supported inverse flags. + inverseMask := uint8(linux.IP6T_INV_DSTIP) | uint8(linux.IP6T_INV_SRCIP) | uint8(linux.IP6T_INV_VIA_OUT) + return iptip.InputInterface != emptyInterface || + iptip.InputInterfaceMask != emptyInterface || + iptip.Flags&^flagMask != 0 || + iptip.InverseFlags&^inverseMask != 0 || + iptip.TOS != 0 +} diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index 47ff48c00..904a12e38 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -17,7 +17,6 @@ package netfilter import ( - "bytes" "errors" "fmt" @@ -27,47 +26,15 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) -// errorTargetName is used to mark targets as error targets. Error targets -// shouldn't be reached - an error has occurred if we fall through to one. -const errorTargetName = "ERROR" - -// redirectTargetName is used to mark targets as redirect targets. Redirect -// targets should be reached for only NAT and Mangle tables. These targets will -// change the destination port/destination IP for packets. -const redirectTargetName = "REDIRECT" - -// Metadata is used to verify that we are correctly serializing and -// deserializing iptables into structs consumable by the iptables tool. We save -// a metadata struct when the tables are written, and when they are read out we -// verify that certain fields are the same. -// -// metadata is used by this serialization/deserializing code, not netstack. -type metadata struct { - HookEntry [linux.NF_INET_NUMHOOKS]uint32 - Underflow [linux.NF_INET_NUMHOOKS]uint32 - NumEntries uint32 - Size uint32 -} - // enableLogging controls whether to log the (de)serialization of netfilter // structs between userspace and netstack. These logs are useful when // developing iptables, but can pollute sentry logs otherwise. const enableLogging = false -// emptyFilter is for comparison with a rule's filters to determine whether it -// is also empty. It is immutable. -var emptyFilter = stack.IPHeaderFilter{ - Dst: "\x00\x00\x00\x00", - DstMask: "\x00\x00\x00\x00", - Src: "\x00\x00\x00\x00", - SrcMask: "\x00\x00\x00\x00", -} - // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { if enableLogging && log.IsLogging(log.Debug) { @@ -76,65 +43,44 @@ func nflog(format string, args ...interface{}) { } // GetInfo returns information about iptables. -func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) { +func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. var info linux.IPTGetinfo - if _, err := t.CopyIn(outPtr, &info); err != nil { + if _, err := info.CopyIn(t, outPtr); err != nil { return linux.IPTGetinfo{}, syserr.FromError(err) } - // Find the appropriate table. - table, err := findTable(stack, info.Name) + var err error + if ipv6 { + _, info, err = convertNetstackToBinary6(stack, info.Name) + } else { + _, info, err = convertNetstackToBinary4(stack, info.Name) + } if err != nil { - nflog("%v", err) + nflog("couldn't convert iptables: %v", err) return linux.IPTGetinfo{}, syserr.ErrInvalidArgument } - // Get the hooks that apply to this table. - info.ValidHooks = table.ValidHooks() - - // Grab the metadata struct, which is used to store information (e.g. - // the number of entries) that applies to the user's encoding of - // iptables, but not netstack's. - metadata := table.Metadata().(metadata) - - // Set values from metadata. - info.HookEntry = metadata.HookEntry - info.Underflow = metadata.Underflow - info.NumEntries = metadata.NumEntries - info.Size = metadata.Size - nflog("returning info: %+v", info) - return info, nil } -// GetEntries returns netstack's iptables rules encoded for the iptables tool. -func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { +// GetEntries4 returns netstack's iptables rules. +func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { // Read in the struct and table name. var userEntries linux.IPTGetEntries - if _, err := t.CopyIn(outPtr, &userEntries); err != nil { + if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } - // Find the appropriate table. - table, err := findTable(stack, userEntries.Name) - if err != nil { - nflog("%v", err) - return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument - } - // Convert netstack's iptables rules to something that the iptables // tool can understand. - entries, meta, err := convertNetstackToBinary(userEntries.Name.String(), table) + entries, _, err := convertNetstackToBinary4(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } - if meta != table.Metadata().(metadata) { - panic(fmt.Sprintf("Table %q metadata changed between writing and reading. Was saved as %+v, but is now %+v", userEntries.Name.String(), table.Metadata().(metadata), meta)) - } if binary.Size(entries) > uintptr(outLen) { nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument @@ -143,258 +89,53 @@ func GetEntries(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen return entries, nil } -func findTable(stk *stack.Stack, tablename linux.TableName) (stack.Table, error) { - ipt := stk.IPTables() - table, ok := ipt.Tables[tablename.String()] - if !ok { - return stack.Table{}, fmt.Errorf("couldn't find table %q", tablename) - } - return table, nil -} - -// FillDefaultIPTables sets stack's IPTables to the default tables and -// populates them with metadata. -func FillDefaultIPTables(stk *stack.Stack) { - ipt := stack.DefaultTables() - - // In order to fill in the metadata, we have to translate ipt from its - // netstack format to Linux's giant-binary-blob format. - for name, table := range ipt.Tables { - _, metadata, err := convertNetstackToBinary(name, table) - if err != nil { - panic(fmt.Errorf("Unable to set default IP tables: %v", err)) - } - table.SetMetadata(metadata) - ipt.Tables[name] = table - } - - stk.SetIPTables(ipt) -} - -// convertNetstackToBinary converts the iptables as stored in netstack to the -// format expected by the iptables tool. Linux stores each table as a binary -// blob that can only be traversed by parsing a bit, reading some offsets, -// jumping to those offsets, parsing again, etc. -func convertNetstackToBinary(tablename string, table stack.Table) (linux.KernelIPTGetEntries, metadata, error) { - // Return values. - var entries linux.KernelIPTGetEntries - var meta metadata - - // The table name has to fit in the struct. - if linux.XT_TABLE_MAXNAMELEN < len(tablename) { - return linux.KernelIPTGetEntries{}, metadata{}, fmt.Errorf("table name %q too long.", tablename) - } - copy(entries.Name[:], tablename) - - for ruleIdx, rule := range table.Rules { - nflog("convert to binary: current offset: %d", entries.Size) - - // Is this a chain entry point? - for hook, hookRuleIdx := range table.BuiltinChains { - if hookRuleIdx == ruleIdx { - nflog("convert to binary: found hook %d at offset %d", hook, entries.Size) - meta.HookEntry[hook] = entries.Size - } - } - // Is this a chain underflow point? - for underflow, underflowRuleIdx := range table.Underflows { - if underflowRuleIdx == ruleIdx { - nflog("convert to binary: found underflow %d at offset %d", underflow, entries.Size) - meta.Underflow[underflow] = entries.Size - } - } - - // Each rule corresponds to an entry. - entry := linux.KernelIPTEntry{ - IPTEntry: linux.IPTEntry{ - IP: linux.IPTIP{ - Protocol: uint16(rule.Filter.Protocol), - }, - NextOffset: linux.SizeOfIPTEntry, - TargetOffset: linux.SizeOfIPTEntry, - }, - } - copy(entry.IPTEntry.IP.Dst[:], rule.Filter.Dst) - copy(entry.IPTEntry.IP.DstMask[:], rule.Filter.DstMask) - copy(entry.IPTEntry.IP.Src[:], rule.Filter.Src) - copy(entry.IPTEntry.IP.SrcMask[:], rule.Filter.SrcMask) - copy(entry.IPTEntry.IP.OutputInterface[:], rule.Filter.OutputInterface) - copy(entry.IPTEntry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) - if rule.Filter.DstInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_DSTIP - } - if rule.Filter.SrcInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_SRCIP - } - if rule.Filter.OutputInterfaceInvert { - entry.IPTEntry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT - } - - for _, matcher := range rule.Matchers { - // Serialize the matcher and add it to the - // entry. - serialized := marshalMatcher(matcher) - nflog("convert to binary: matcher serialized as: %v", serialized) - if len(serialized)%8 != 0 { - panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) - } - entry.Elems = append(entry.Elems, serialized...) - entry.NextOffset += uint16(len(serialized)) - entry.TargetOffset += uint16(len(serialized)) - } - - // Serialize and append the target. - serialized := marshalTarget(rule.Target) - if len(serialized)%8 != 0 { - panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) - } - entry.Elems = append(entry.Elems, serialized...) - entry.NextOffset += uint16(len(serialized)) - - nflog("convert to binary: adding entry: %+v", entry) - - entries.Size += uint32(entry.NextOffset) - entries.Entrytable = append(entries.Entrytable, entry) - meta.NumEntries++ - } - - nflog("convert to binary: finished with an marshalled size of %d", meta.Size) - meta.Size = entries.Size - return entries, meta, nil -} - -func marshalTarget(target stack.Target) []byte { - switch tg := target.(type) { - case stack.AcceptTarget: - return marshalStandardTarget(stack.RuleAccept) - case stack.DropTarget: - return marshalStandardTarget(stack.RuleDrop) - case stack.ErrorTarget: - return marshalErrorTarget(errorTargetName) - case stack.UserChainTarget: - return marshalErrorTarget(tg.Name) - case stack.ReturnTarget: - return marshalStandardTarget(stack.RuleReturn) - case stack.RedirectTarget: - return marshalRedirectTarget(tg) - case JumpTarget: - return marshalJumpTarget(tg) - default: - panic(fmt.Errorf("unknown target of type %T", target)) - } -} - -func marshalStandardTarget(verdict stack.RuleVerdict) []byte { - nflog("convert to binary: marshalling standard target") - - // The target's name will be the empty string. - target := linux.XTStandardTarget{ - Target: linux.XTEntryTarget{ - TargetSize: linux.SizeOfXTStandardTarget, - }, - Verdict: translateFromStandardVerdict(verdict), - } - - ret := make([]byte, 0, linux.SizeOfXTStandardTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) -} - -func marshalErrorTarget(errorName string) []byte { - // This is an error target named error - target := linux.XTErrorTarget{ - Target: linux.XTEntryTarget{ - TargetSize: linux.SizeOfXTErrorTarget, - }, - } - copy(target.Name[:], errorName) - copy(target.Target.Name[:], errorTargetName) - - ret := make([]byte, 0, linux.SizeOfXTErrorTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) -} - -func marshalRedirectTarget(rt stack.RedirectTarget) []byte { - // This is a redirect target named redirect - target := linux.XTRedirectTarget{ - Target: linux.XTEntryTarget{ - TargetSize: linux.SizeOfXTRedirectTarget, - }, +// GetEntries6 returns netstack's ip6tables rules. +func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) { + // Read in the struct and table name. IPv4 and IPv6 utilize structs + // with the same layout. + var userEntries linux.IPTGetEntries + if _, err := userEntries.CopyIn(t, outPtr); err != nil { + nflog("couldn't copy in entries %q", userEntries.Name) + return linux.KernelIP6TGetEntries{}, syserr.FromError(err) } - copy(target.Target.Name[:], redirectTargetName) - ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) - target.NfRange.RangeSize = 1 - if rt.RangeProtoSpecified { - target.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + // Convert netstack's iptables rules to something that the iptables + // tool can understand. + entries, _, err := convertNetstackToBinary6(stack, userEntries.Name) + if err != nil { + nflog("couldn't read entries: %v", err) + return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } - // Convert port from little endian to big endian. - port := make([]byte, 2) - binary.LittleEndian.PutUint16(port, rt.MinPort) - target.NfRange.RangeIPV4.MinPort = binary.BigEndian.Uint16(port) - binary.LittleEndian.PutUint16(port, rt.MaxPort) - target.NfRange.RangeIPV4.MaxPort = binary.BigEndian.Uint16(port) - return binary.Marshal(ret, usermem.ByteOrder, target) -} - -func marshalJumpTarget(jt JumpTarget) []byte { - nflog("convert to binary: marshalling jump target") - - // The target's name will be the empty string. - target := linux.XTStandardTarget{ - Target: linux.XTEntryTarget{ - TargetSize: linux.SizeOfXTStandardTarget, - }, - // Verdict is overloaded by the ABI. When positive, it holds - // the jump offset from the start of the table. - Verdict: int32(jt.Offset), + if binary.Size(entries) > uintptr(outLen) { + nflog("insufficient GetEntries output size: %d", uintptr(outLen)) + return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } - ret := make([]byte, 0, linux.SizeOfXTStandardTarget) - return binary.Marshal(ret, usermem.ByteOrder, target) + return entries, nil } -// translateFromStandardVerdict translates verdicts the same way as the iptables -// tool. -func translateFromStandardVerdict(verdict stack.RuleVerdict) int32 { - switch verdict { - case stack.RuleAccept: - return -linux.NF_ACCEPT - 1 - case stack.RuleDrop: - return -linux.NF_DROP - 1 - case stack.RuleReturn: - return linux.NF_RETURN - default: - // TODO(gvisor.dev/issue/170): Support Jump. - panic(fmt.Sprintf("unknown standard verdict: %d", verdict)) +// setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint +// or underflow, in which case it fills in info.HookEntry and info.Underflows. +func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) { + // Is this a chain entry point? + for hook, hookRuleIdx := range table.BuiltinChains { + if hookRuleIdx == ruleIdx { + nflog("convert to binary: found hook %d at offset %d", hook, offset) + info.HookEntry[hook] = offset + } } -} - -// translateToStandardTarget translates from the value in a -// linux.XTStandardTarget to an stack.Verdict. -func translateToStandardTarget(val int32) (stack.Target, error) { - // TODO(gvisor.dev/issue/170): Support other verdicts. - switch val { - case -linux.NF_ACCEPT - 1: - return stack.AcceptTarget{}, nil - case -linux.NF_DROP - 1: - return stack.DropTarget{}, nil - case -linux.NF_QUEUE - 1: - return nil, errors.New("unsupported iptables verdict QUEUE") - case linux.NF_RETURN: - return stack.ReturnTarget{}, nil - default: - return nil, fmt.Errorf("unknown iptables verdict %d", val) + // Is this a chain underflow point? + for underflow, underflowRuleIdx := range table.Underflows { + if underflowRuleIdx == ruleIdx { + nflog("convert to binary: found underflow %d at offset %d", underflow, offset) + info.Underflow[underflow] = offset + } } } // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. -func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { - // Get the basic rules data (struct ipt_replace). - if len(optVal) < linux.SizeOfIPTReplace { - nflog("optVal has insufficient size for replace %d", len(optVal)) - return syserr.ErrInvalidArgument - } +func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace replaceBuf := optVal[:linux.SizeOfIPTReplace] optVal = optVal[linux.SizeOfIPTReplace:] @@ -403,88 +144,24 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // TODO(gvisor.dev/issue/170): Support other tables. var table stack.Table switch replace.Name.String() { - case stack.TablenameFilter: + case stack.FilterTable: table = stack.EmptyFilterTable() - case stack.TablenameNat: - table = stack.EmptyNatTable() + case stack.NATTable: + table = stack.EmptyNATTable() default: nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) return syserr.ErrInvalidArgument } - nflog("set entries: setting entries in table %q", replace.Name.String()) - - // Convert input into a list of rules and their offsets. - var offset uint32 - // offsets maps rule byte offsets to their position in table.Rules. - offsets := map[uint32]int{} - for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { - nflog("set entries: processing entry at offset %d", offset) - - // Get the struct ipt_entry. - if len(optVal) < linux.SizeOfIPTEntry { - nflog("optVal has insufficient size for entry %d", len(optVal)) - return syserr.ErrInvalidArgument - } - var entry linux.IPTEntry - buf := optVal[:linux.SizeOfIPTEntry] - binary.Unmarshal(buf, usermem.ByteOrder, &entry) - initialOptValLen := len(optVal) - optVal = optVal[linux.SizeOfIPTEntry:] - - if entry.TargetOffset < linux.SizeOfIPTEntry { - nflog("entry has too-small target offset %d", entry.TargetOffset) - return syserr.ErrInvalidArgument - } - - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. - filter, err := filterFromIPTIP(entry.IP) - if err != nil { - nflog("bad iptip: %v", err) - return syserr.ErrInvalidArgument - } - - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. - // Get matchers. - matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry - if len(optVal) < int(matchersSize) { - nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) - return syserr.ErrInvalidArgument - } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) - if err != nil { - nflog("failed to parse matchers: %v", err) - return syserr.ErrInvalidArgument - } - optVal = optVal[matchersSize:] - - // Get the target of the rule. - targetSize := entry.NextOffset - entry.TargetOffset - if len(optVal) < int(targetSize) { - nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) - return syserr.ErrInvalidArgument - } - target, err := parseTarget(filter, optVal[:targetSize]) - if err != nil { - nflog("failed to parse target: %v", err) - return syserr.ErrInvalidArgument - } - optVal = optVal[targetSize:] - - table.Rules = append(table.Rules, stack.Rule{ - Filter: filter, - Target: target, - Matchers: matchers, - }) - offsets[offset] = int(entryIdx) - offset += uint32(entry.NextOffset) - - if initialOptValLen-len(optVal) != int(entry.NextOffset) { - nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) - return syserr.ErrInvalidArgument - } + var err *syserr.Error + var offsets map[uint32]int + if ipv6 { + offsets, err = modifyEntries6(stk, optVal, &replace, &table) + } else { + offsets, err = modifyEntries4(stk, optVal, &replace, &table) + } + if err != nil { + return err } // Go through the list of supported hooks for this table and, for each @@ -492,12 +169,14 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { for hook, _ := range replace.HookEntry { if table.ValidHooks()&(1<<hook) != 0 { hk := hookFromLinux(hook) + table.BuiltinChains[hk] = stack.HookUnset + table.Underflows[hk] = stack.HookUnset for offset, ruleIdx := range offsets { if offset == replace.HookEntry[hook] { table.BuiltinChains[hk] = ruleIdx } if offset == replace.Underflow[hook] { - if !validUnderflow(table.Rules[ruleIdx]) { + if !validUnderflow(table.Rules[ruleIdx], ipv6) { nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx) return syserr.ErrInvalidArgument } @@ -515,10 +194,9 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { } } - // Add the user chains. + // Check the user chains. for ruleIdx, rule := range table.Rules { - target, ok := rule.Target.(stack.UserChainTarget) - if !ok { + if _, ok := rule.Target.(*stack.UserChainTarget); !ok { continue } @@ -534,13 +212,12 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { nflog("user chain's first node must have no matchers") return syserr.ErrInvalidArgument } - table.UserChains[target.Name] = ruleIdx + 1 } // Set each jump to point to the appropriate rule. Right now they hold byte // offsets. for ruleIdx, rule := range table.Rules { - jump, ok := rule.Target.(JumpTarget) + jump, ok := rule.Target.(*JumpTarget) if !ok { continue } @@ -560,8 +237,11 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // Since we only support modifying the INPUT, PREROUTING and OUTPUT chain right now, // make sure all other chains point to ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { - if hook == stack.Forward || hook == stack.Postrouting { - if !isUnconditionalAccept(table.Rules[ruleIdx]) { + if hook := stack.Hook(hook); hook == stack.Forward || hook == stack.Postrouting { + if ruleIdx == stack.HookUnset { + continue + } + if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } @@ -573,17 +253,8 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error { // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. - ipt := stk.IPTables() - table.SetMetadata(metadata{ - HookEntry: replace.HookEntry, - Underflow: replace.Underflow, - NumEntries: replace.NumEntries, - Size: replace.Size, - }) - ipt.Tables[replace.Name.String()] = table - stk.SetIPTables(ipt) - - return nil + return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6)) + } // parseMatchers parses 0 or more matchers from optVal. optVal should contain @@ -605,7 +276,6 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { - return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) } if len(optVal) < int(match.MatchSize) { @@ -630,186 +300,26 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, return matchers, nil } -// parseTarget parses a target from optVal. optVal should contain only the -// target. -func parseTarget(filter stack.IPHeaderFilter, optVal []byte) (stack.Target, error) { - nflog("set entries: parsing target of size %d", len(optVal)) - if len(optVal) < linux.SizeOfXTEntryTarget { - return nil, fmt.Errorf("optVal has insufficient size for entry target %d", len(optVal)) - } - var target linux.XTEntryTarget - buf := optVal[:linux.SizeOfXTEntryTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &target) - switch target.Name.String() { - case "": - // Standard target. - if len(optVal) != linux.SizeOfXTStandardTarget { - return nil, fmt.Errorf("optVal has wrong size for standard target %d", len(optVal)) - } - var standardTarget linux.XTStandardTarget - buf = optVal[:linux.SizeOfXTStandardTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) - - if standardTarget.Verdict < 0 { - // A Verdict < 0 indicates a non-jump verdict. - return translateToStandardTarget(standardTarget.Verdict) - } - // A verdict >= 0 indicates a jump. - return JumpTarget{Offset: uint32(standardTarget.Verdict)}, nil - - case errorTargetName: - // Error target. - if len(optVal) != linux.SizeOfXTErrorTarget { - return nil, fmt.Errorf("optVal has insufficient size for error target %d", len(optVal)) - } - var errorTarget linux.XTErrorTarget - buf = optVal[:linux.SizeOfXTErrorTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget) - - // Error targets are used in 2 cases: - // * An actual error case. These rules have an error - // named errorTargetName. The last entry of the table - // is usually an error case to catch any packets that - // somehow fall through every rule. - // * To mark the start of a user defined chain. These - // rules have an error with the name of the chain. - switch name := errorTarget.Name.String(); name { - case errorTargetName: - nflog("set entries: error target") - return stack.ErrorTarget{}, nil - default: - // User defined chain. - nflog("set entries: user-defined target %q", name) - return stack.UserChainTarget{Name: name}, nil - } - - case redirectTargetName: - // Redirect target. - if len(optVal) < linux.SizeOfXTRedirectTarget { - return nil, fmt.Errorf("netfilter.SetEntries: optVal has insufficient size for redirect target %d", len(optVal)) - } - - if filter.Protocol != header.TCPProtocolNumber && filter.Protocol != header.UDPProtocolNumber { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - var redirectTarget linux.XTRedirectTarget - buf = optVal[:linux.SizeOfXTRedirectTarget] - binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) - - // Copy linux.XTRedirectTarget to stack.RedirectTarget. - var target stack.RedirectTarget - nfRange := redirectTarget.NfRange - - // RangeSize should be 1. - if nfRange.RangeSize != 1 { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - // TODO(gvisor.dev/issue/170): Check if the flags are valid. - // Also check if we need to map ports or IP. - // For now, redirect target only supports destination port change. - // Port range and IP range are not supported yet. - if nfRange.RangeIPV4.Flags&linux.NF_NAT_RANGE_PROTO_SPECIFIED == 0 { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - target.RangeProtoSpecified = true - - target.MinIP = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) - target.MaxIP = tcpip.Address(nfRange.RangeIPV4.MaxIP[:]) - - // TODO(gvisor.dev/issue/170): Port range is not supported yet. - if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { - return nil, fmt.Errorf("netfilter.SetEntries: invalid argument") - } - - // Convert port from big endian to little endian. - port := make([]byte, 2) - binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MinPort) - target.MinPort = binary.LittleEndian.Uint16(port) - - binary.BigEndian.PutUint16(port, nfRange.RangeIPV4.MaxPort) - target.MaxPort = binary.LittleEndian.Uint16(port) - return target, nil - } - - // Unknown target. - return nil, fmt.Errorf("unknown target %q doesn't exist or isn't supported yet.", target.Name.String()) -} - -func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) { - if containsUnsupportedFields(iptip) { - return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) - } - if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize { - return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) - } - if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize { - return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) - } - - n := bytes.IndexByte([]byte(iptip.OutputInterface[:]), 0) - if n == -1 { - n = len(iptip.OutputInterface) - } - ifname := string(iptip.OutputInterface[:n]) - - n = bytes.IndexByte([]byte(iptip.OutputInterfaceMask[:]), 0) - if n == -1 { - n = len(iptip.OutputInterfaceMask) - } - ifnameMask := string(iptip.OutputInterfaceMask[:n]) - - return stack.IPHeaderFilter{ - Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), - Dst: tcpip.Address(iptip.Dst[:]), - DstMask: tcpip.Address(iptip.DstMask[:]), - DstInvert: iptip.InverseFlags&linux.IPT_INV_DSTIP != 0, - Src: tcpip.Address(iptip.Src[:]), - SrcMask: tcpip.Address(iptip.SrcMask[:]), - SrcInvert: iptip.InverseFlags&linux.IPT_INV_SRCIP != 0, - OutputInterface: ifname, - OutputInterfaceMask: ifnameMask, - OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0, - }, nil -} - -func containsUnsupportedFields(iptip linux.IPTIP) bool { - // The following features are supported: - // - Protocol - // - Dst and DstMask - // - Src and SrcMask - // - The inverse destination IP check flag - // - OutputInterface, OutputInterfaceMask and its inverse. - var emptyInterface = [linux.IFNAMSIZ]byte{} - // Disable any supported inverse flags. - inverseMask := uint8(linux.IPT_INV_DSTIP) | uint8(linux.IPT_INV_SRCIP) | uint8(linux.IPT_INV_VIA_OUT) - return iptip.InputInterface != emptyInterface || - iptip.InputInterfaceMask != emptyInterface || - iptip.Flags != 0 || - iptip.InverseFlags&^inverseMask != 0 -} - -func validUnderflow(rule stack.Rule) bool { +func validUnderflow(rule stack.Rule, ipv6 bool) bool { if len(rule.Matchers) != 0 { return false } - if rule.Filter != emptyFilter { + if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) { return false } switch rule.Target.(type) { - case stack.AcceptTarget, stack.DropTarget: + case *stack.AcceptTarget, *stack.DropTarget: return true default: return false } } -func isUnconditionalAccept(rule stack.Rule) bool { - if !validUnderflow(rule) { +func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool { + if !validUnderflow(rule, ipv6) { return false } - _, ok := rule.Target.(stack.AcceptTarget) + _, ok := rule.Target.(*stack.AcceptTarget) return ok } @@ -828,3 +338,20 @@ func hookFromLinux(hook int) stack.Hook { } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } + +// TargetRevision returns a linux.XTGetRevision for a given target. It sets +// Revision to the highest supported value, unless the provided revision number +// is larger. +func TargetRevision(t *kernel.Task, revPtr usermem.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) { + // Read in the target name and version. + var rev linux.XTGetRevision + if _, err := rev.CopyIn(t, revPtr); err != nil { + return linux.XTGetRevision{}, syserr.FromError(err) + } + maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision) + if !ok { + return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported + } + rev.Revision = maxSupported + return rev, nil +} diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 3863293c7..1b4e0ad79 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -111,7 +111,7 @@ func (*OwnerMatcher) Name() string { } // Match implements Matcher.Match. -func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { +func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { // Support only for OUTPUT chain. // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index 84abe8d29..0e14447fe 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -15,10 +15,359 @@ package netfilter import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" ) +func init() { + // Standard targets include ACCEPT, DROP, RETURN, and JUMP. + registerTargetMaker(&standardTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&standardTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) + + // Both user chains and actual errors are represented in iptables by + // error targets. + registerTargetMaker(&errorTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&errorTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) + + registerTargetMaker(&redirectTargetMaker{ + NetworkProtocol: header.IPv4ProtocolNumber, + }) + registerTargetMaker(&nfNATTargetMaker{ + NetworkProtocol: header.IPv6ProtocolNumber, + }) +} + +type standardTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (sm *standardTargetMaker) id() stack.TargetID { + // Standard targets have the empty string as a name and no revisions. + return stack.TargetID{ + NetworkProtocol: sm.NetworkProtocol, + } +} +func (*standardTargetMaker) marshal(target stack.Target) []byte { + // Translate verdicts the same way as the iptables tool. + var verdict int32 + switch tg := target.(type) { + case *stack.AcceptTarget: + verdict = -linux.NF_ACCEPT - 1 + case *stack.DropTarget: + verdict = -linux.NF_DROP - 1 + case *stack.ReturnTarget: + verdict = linux.NF_RETURN + case *JumpTarget: + verdict = int32(tg.Offset) + default: + panic(fmt.Errorf("unknown target of type %T", target)) + } + + // The target's name will be the empty string. + xt := linux.XTStandardTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTStandardTarget, + }, + Verdict: verdict, + } + + ret := make([]byte, 0, linux.SizeOfXTStandardTarget) + return binary.Marshal(ret, usermem.ByteOrder, xt) +} + +func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) != linux.SizeOfXTStandardTarget { + nflog("buf has wrong size for standard target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + var standardTarget linux.XTStandardTarget + buf = buf[:linux.SizeOfXTStandardTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget) + + if standardTarget.Verdict < 0 { + // A Verdict < 0 indicates a non-jump verdict. + return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol()) + } + // A verdict >= 0 indicates a jump. + return &JumpTarget{ + Offset: uint32(standardTarget.Verdict), + NetworkProtocol: filter.NetworkProtocol(), + }, nil +} + +type errorTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (em *errorTargetMaker) id() stack.TargetID { + // Error targets have no revision. + return stack.TargetID{ + Name: stack.ErrorTargetName, + NetworkProtocol: em.NetworkProtocol, + } +} + +func (*errorTargetMaker) marshal(target stack.Target) []byte { + var errorName string + switch tg := target.(type) { + case *stack.ErrorTarget: + errorName = stack.ErrorTargetName + case *stack.UserChainTarget: + errorName = tg.Name + default: + panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target)) + } + + // This is an error target named error + xt := linux.XTErrorTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTErrorTarget, + }, + } + copy(xt.Name[:], errorName) + copy(xt.Target.Name[:], stack.ErrorTargetName) + + ret := make([]byte, 0, linux.SizeOfXTErrorTarget) + return binary.Marshal(ret, usermem.ByteOrder, xt) +} + +func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) != linux.SizeOfXTErrorTarget { + nflog("buf has insufficient size for error target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + var errorTarget linux.XTErrorTarget + buf = buf[:linux.SizeOfXTErrorTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget) + + // Error targets are used in 2 cases: + // * An actual error case. These rules have an error + // named stack.ErrorTargetName. The last entry of the table + // is usually an error case to catch any packets that + // somehow fall through every rule. + // * To mark the start of a user defined chain. These + // rules have an error with the name of the chain. + switch name := errorTarget.Name.String(); name { + case stack.ErrorTargetName: + return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil + default: + // User defined chain. + return &stack.UserChainTarget{ + Name: name, + NetworkProtocol: filter.NetworkProtocol(), + }, nil + } +} + +type redirectTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (rm *redirectTargetMaker) id() stack.TargetID { + return stack.TargetID{ + Name: stack.RedirectTargetName, + NetworkProtocol: rm.NetworkProtocol, + } +} + +func (*redirectTargetMaker) marshal(target stack.Target) []byte { + rt := target.(*stack.RedirectTarget) + // This is a redirect target named redirect + xt := linux.XTRedirectTarget{ + Target: linux.XTEntryTarget{ + TargetSize: linux.SizeOfXTRedirectTarget, + }, + } + copy(xt.Target.Name[:], stack.RedirectTargetName) + + ret := make([]byte, 0, linux.SizeOfXTRedirectTarget) + xt.NfRange.RangeSize = 1 + xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED + xt.NfRange.RangeIPV4.MinPort = htons(rt.Port) + xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort + return binary.Marshal(ret, usermem.ByteOrder, xt) +} + +func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if len(buf) < linux.SizeOfXTRedirectTarget { + nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf)) + return nil, syserr.ErrInvalidArgument + } + + if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { + nflog("redirectTargetMaker: bad proto %d", p) + return nil, syserr.ErrInvalidArgument + } + + var redirectTarget linux.XTRedirectTarget + buf = buf[:linux.SizeOfXTRedirectTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget) + + // Copy linux.XTRedirectTarget to stack.RedirectTarget. + target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()} + + // RangeSize should be 1. + nfRange := redirectTarget.NfRange + if nfRange.RangeSize != 1 { + nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Check if the flags are valid. + // Also check if we need to map ports or IP. + // For now, redirect target only supports destination port change. + // Port range and IP range are not supported yet. + if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { + nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags) + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/170): Port range is not supported yet. + if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { + nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) + return nil, syserr.ErrInvalidArgument + } + if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { + nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) + return nil, syserr.ErrInvalidArgument + } + + target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) + target.Port = ntohs(nfRange.RangeIPV4.MinPort) + + return &target, nil +} + +type nfNATTarget struct { + Target linux.XTEntryTarget + Range linux.NFNATRange +} + +const nfNATMarhsalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange + +type nfNATTargetMaker struct { + NetworkProtocol tcpip.NetworkProtocolNumber +} + +func (rm *nfNATTargetMaker) id() stack.TargetID { + return stack.TargetID{ + Name: stack.RedirectTargetName, + NetworkProtocol: rm.NetworkProtocol, + } +} + +func (*nfNATTargetMaker) marshal(target stack.Target) []byte { + rt := target.(*stack.RedirectTarget) + nt := nfNATTarget{ + Target: linux.XTEntryTarget{ + TargetSize: nfNATMarhsalledSize, + }, + Range: linux.NFNATRange{ + Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED, + }, + } + copy(nt.Target.Name[:], stack.RedirectTargetName) + copy(nt.Range.MinAddr[:], rt.Addr) + copy(nt.Range.MaxAddr[:], rt.Addr) + + nt.Range.MinProto = htons(rt.Port) + nt.Range.MaxProto = nt.Range.MinProto + + ret := make([]byte, 0, nfNATMarhsalledSize) + return binary.Marshal(ret, usermem.ByteOrder, nt) +} + +func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) { + if size := nfNATMarhsalledSize; len(buf) < size { + nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size) + return nil, syserr.ErrInvalidArgument + } + + if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { + nflog("nfNATTargetMaker: bad proto %d", p) + return nil, syserr.ErrInvalidArgument + } + + var natRange linux.NFNATRange + buf = buf[linux.SizeOfXTEntryTarget:nfNATMarhsalledSize] + binary.Unmarshal(buf, usermem.ByteOrder, &natRange) + + // We don't support port or address ranges. + if natRange.MinAddr != natRange.MaxAddr { + nflog("nfNATTargetMaker: MinAddr and MaxAddr are different") + return nil, syserr.ErrInvalidArgument + } + if natRange.MinProto != natRange.MaxProto { + nflog("nfNATTargetMaker: MinProto and MaxProto are different") + return nil, syserr.ErrInvalidArgument + } + + // TODO(gvisor.dev/issue/3549): Check for other flags. + // For now, redirect target only supports destination change. + if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { + nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags) + return nil, syserr.ErrInvalidArgument + } + + target := stack.RedirectTarget{ + NetworkProtocol: filter.NetworkProtocol(), + Addr: tcpip.Address(natRange.MinAddr[:]), + Port: ntohs(natRange.MinProto), + } + + return &target, nil +} + +// translateToStandardTarget translates from the value in a +// linux.XTStandardTarget to an stack.Verdict. +func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) { + // TODO(gvisor.dev/issue/170): Support other verdicts. + switch val { + case -linux.NF_ACCEPT - 1: + return &stack.AcceptTarget{NetworkProtocol: netProto}, nil + case -linux.NF_DROP - 1: + return &stack.DropTarget{NetworkProtocol: netProto}, nil + case -linux.NF_QUEUE - 1: + nflog("unsupported iptables verdict QUEUE") + return nil, syserr.ErrInvalidArgument + case linux.NF_RETURN: + return &stack.ReturnTarget{NetworkProtocol: netProto}, nil + default: + nflog("unknown iptables verdict %d", val) + return nil, syserr.ErrInvalidArgument + } +} + +// parseTarget parses a target from optVal. optVal should contain only the +// target. +func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) { + nflog("set entries: parsing target of size %d", len(optVal)) + if len(optVal) < linux.SizeOfXTEntryTarget { + nflog("optVal has insufficient size for entry target %d", len(optVal)) + return nil, syserr.ErrInvalidArgument + } + var target linux.XTEntryTarget + buf := optVal[:linux.SizeOfXTEntryTarget] + binary.Unmarshal(buf, usermem.ByteOrder, &target) + + return unmarshalTarget(target, filter, optVal) +} + // JumpTarget implements stack.Target. type JumpTarget struct { // Offset is the byte offset of the rule to jump to. It is used for @@ -27,9 +376,31 @@ type JumpTarget struct { // RuleNum is the rule to jump to. RuleNum int + + // NetworkProtocol is the network protocol the target is used with. + NetworkProtocol tcpip.NetworkProtocolNumber +} + +// ID implements Target.ID. +func (jt *JumpTarget) ID() stack.TargetID { + return stack.TargetID{ + NetworkProtocol: jt.NetworkProtocol, + } } // Action implements stack.Target.Action. -func (jt JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrackTable, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { +func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } + +func ntohs(port uint16) uint16 { + buf := make([]byte, 2) + binary.BigEndian.PutUint16(buf, port) + return usermem.ByteOrder.Uint16(buf) +} + +func htons(port uint16) uint16 { + buf := make([]byte, 2) + usermem.ByteOrder.PutUint16(buf, port) + return binary.BigEndian.Uint16(buf) +} diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 57a1e1c12..352c51390 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -71,7 +71,7 @@ func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma } if filter.Protocol != header.TCPProtocolNumber { - return nil, fmt.Errorf("TCP matching is only valid for protocol %d.", header.TCPProtocolNumber) + return nil, fmt.Errorf("TCP matching is only valid for protocol %d", header.TCPProtocolNumber) } return &TCPMatcher{ @@ -96,51 +96,41 @@ func (*TCPMatcher) Name() string { } // Match implements Matcher.Match. -func (tm *TCPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader) - - if netHeader.TransportProtocol() != header.TCPProtocolNumber { - return false, false - } - - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true +func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { + // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved + // into the stack.Check codepath as matchers are added. + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.TCPProtocolNumber { + return false, false } - return false, false - } - // Now we need the transport header. However, this may not have been set - // yet. - // TODO(gvisor.dev/issue/170): Parsing the transport header should - // ultimately be moved into the stack.Check codepath as matchers are - // added. - var tcpHeader header.TCP - if pkt.TransportHeader != nil { - tcpHeader = header.TCP(pkt.TransportHeader) - } else { - var length int - if hook == stack.Prerouting { - // The network header hasn't been parsed yet. We have to do it here. - hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - // There's no valid TCP header here, so we hotdrop the - // packet. + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { return false, true } - h := header.IPv4(hdr) - pkt.NetworkHeader = hdr - length = int(h.HeaderLength()) + return false, false } - // The TCP header hasn't been parsed yet. We have to do it here. - hdr, ok := pkt.Data.PullUp(length + header.TCPMinimumSize) - if !ok { - // There's no valid TCP header here, so we hotdrop the - // packet. - return false, true + + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber { + return false, false } - tcpHeader = header.TCP(hdr[length:]) + + default: + // We don't know the network protocol. + return false, false + } + + tcpHeader := header.TCP(pkt.TransportHeader().View()) + if len(tcpHeader) < header.TCPMinimumSize { + // There's no valid TCP header here, so we drop the packet immediately. + return false, true } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index cfa9e621d..c88d8268d 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -68,7 +68,7 @@ func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Ma } if filter.Protocol != header.UDPProtocolNumber { - return nil, fmt.Errorf("UDP matching is only valid for protocol %d.", header.UDPProtocolNumber) + return nil, fmt.Errorf("UDP matching is only valid for protocol %d", header.UDPProtocolNumber) } return &UDPMatcher{ @@ -93,53 +93,41 @@ func (*UDPMatcher) Name() string { } // Match implements Matcher.Match. -func (um *UDPMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interfaceName string) (bool, bool) { - netHeader := header.IPv4(pkt.NetworkHeader) - +func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) { // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved // into the stack.Check codepath as matchers are added. - if netHeader.TransportProtocol() != header.UDPProtocolNumber { - return false, false - } - - // We dont't match fragments. - if frag := netHeader.FragmentOffset(); frag != 0 { - if frag == 1 { - return false, true + switch pkt.NetworkProtocolNumber { + case header.IPv4ProtocolNumber: + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if netHeader.TransportProtocol() != header.UDPProtocolNumber { + return false, false } - return false, false - } - // Now we need the transport header. However, this may not have been set - // yet. - // TODO(gvisor.dev/issue/170): Parsing the transport header should - // ultimately be moved into the stack.Check codepath as matchers are - // added. - var udpHeader header.UDP - if pkt.TransportHeader != nil { - udpHeader = header.UDP(pkt.TransportHeader) - } else { - var length int - if hook == stack.Prerouting { - // The network header hasn't been parsed yet. We have to do it here. - hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - // There's no valid UDP header here, so we hotdrop the - // packet. + // We don't match fragments. + if frag := netHeader.FragmentOffset(); frag != 0 { + if frag == 1 { return false, true } - h := header.IPv4(hdr) - pkt.NetworkHeader = hdr - length = int(h.HeaderLength()) + return false, false } - // The UDP header hasn't been parsed yet. We have to do it here. - hdr, ok := pkt.Data.PullUp(length + header.UDPMinimumSize) - if !ok { - // There's no valid UDP header here, so we hotdrop the - // packet. - return false, true + + case header.IPv6ProtocolNumber: + // As in Linux, we do not perform an IPv6 fragment check. See + // xt_action_param.fragoff in + // include/linux/netfilter/x_tables.h. + if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber { + return false, false } - udpHeader = header.UDP(hdr[length:]) + + default: + // We don't know the network protocol. + return false, false + } + + udpHeader := header.UDP(pkt.TransportHeader().View()) + if len(udpHeader) < header.UDPMinimumSize { + // There's no valid UDP header here, so we drop the packet immediately. + return false, true } // Check whether the source and destination ports are within the diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 7212d8644..1f926aa91 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -16,10 +16,13 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/context", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 0d45e5053..31e374833 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int } d := socket.NewDirent(t, netlinkSocketDevice) - defer d.DecRef() + defer d.DecRef(t) return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil } diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go index bb205be0d..f061c5d62 100644 --- a/pkg/sentry/socket/netlink/provider_vfs2.go +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -51,7 +51,8 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() - d := sockfs.NewDentry(t.Credentials(), mnt) + d := sockfs.NewDentry(t, mnt) + defer d.DecRef(t) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index c84d8bd7c..22216158e 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -36,9 +36,9 @@ type commandKind int const ( kindNew commandKind = 0x0 - kindDel = 0x1 - kindGet = 0x2 - kindSet = 0x3 + kindDel commandKind = 0x1 + kindGet commandKind = 0x2 + kindSet commandKind = 0x3 ) func typeKind(typ uint16) commandKind { @@ -423,6 +423,11 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin } attrs = rest + // NOTE: A netlink message will contain multiple header attributes. + // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent + // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the + // local interface address. We add the local interface address here + // and ignore the IFA_ADDRESS. switch ahdr.Type { case linux.IFA_LOCAL: err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ @@ -439,8 +444,57 @@ func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlin } else if err != nil { return syserr.ErrInvalidArgument } + case linux.IFA_ADDRESS: + default: + return syserr.ErrNotSupported + } + } + return nil +} + +// delAddr handles RTM_DELADDR requests. +func (p *Protocol) delAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { + stack := inet.StackFromContext(ctx) + if stack == nil { + // No network stack. + return syserr.ErrProtocolNotSupported + } + + var ifa linux.InterfaceAddrMessage + attrs, ok := msg.GetData(&ifa) + if !ok { + return syserr.ErrInvalidArgument + } + + for !attrs.Empty() { + ahdr, value, rest, ok := attrs.ParseFirst() + if !ok { + return syserr.ErrInvalidArgument + } + attrs = rest + + // NOTE: A netlink message will contain multiple header attributes. + // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent + // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the + // local interface address. We use the local interface address to + // remove the address and ignore the IFA_ADDRESS. + switch ahdr.Type { + case linux.IFA_LOCAL: + err := stack.RemoveInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ + Family: ifa.Family, + PrefixLen: ifa.PrefixLen, + Flags: ifa.Flags, + Addr: value, + }) + if err != nil { + return syserr.ErrInvalidArgument + } + case linux.IFA_ADDRESS: + default: + return syserr.ErrNotSupported } } + return nil } @@ -485,6 +539,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms return p.dumpRoutes(ctx, msg, ms) case linux.RTM_NEWADDR: return p.newAddr(ctx, msg, ms) + case linux.RTM_DELADDR: + return p.delAddr(ctx, msg, ms) default: return syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 81f34c5a2..3baad098b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -16,11 +16,14 @@ package netlink import ( + "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -138,14 +141,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } @@ -162,9 +165,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { - s.connection.Release() - s.ep.Close() +func (s *socketOpsCommon) Release(ctx context.Context) { + s.connection.Release(ctx) + s.ep.Close(ctx) if s.bound { s.ports.Release(s.protocol.Protocol(), s.portID) @@ -330,7 +333,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { @@ -340,24 +343,26 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr } s.mu.Lock() defer s.mu.Unlock() - return int32(s.sendBufferSize), nil + sendBufferSizeP := primitive.Int32(s.sendBufferSize) + return &sendBufferSizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // We don't have limit on receiving size. - return int32(math.MaxInt32), nil + recvBufferSizeP := primitive.Int32(math.MaxInt32) + return &recvBufferSizeP, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - var passcred int32 + var passcred primitive.Int32 if s.Passcred() { passcred = 1 } - return passcred, nil + return &passcred, nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) @@ -617,7 +622,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. - _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { @@ -644,7 +649,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys // Add the dump_done_errno payload. m.Put(int64(0)) - _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } @@ -744,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t buf := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, buf) + // io.EOF can be only returned if src is a file, this means that + // sendMsg is called from splice and the error has to be ignored in + // this case. + if err == io.EOF { + err = nil + } if err != nil { // Don't partially consume messages. return 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index b854bf990..461d524e5 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" @@ -36,10 +37,13 @@ import ( // to/from the kernel. // // SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. +// +// +stateify savable type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -55,18 +59,18 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } - return &SocketVFS2{ + fd := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, @@ -75,7 +79,16 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV connection: connection, sendBufferSize: defaultSendBufferSize, }, - }, nil + } + fd.LockFD.Init(&vfs.FileLocks{}) + return fd, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.socketOpsCommon.Release(ctx) } // Readiness implements waiter.Waitable.Readiness. @@ -136,3 +149,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 333e0042e..fae3b6783 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -22,12 +22,15 @@ go_library( "//pkg/binary", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -50,5 +53,6 @@ go_library( "//pkg/tcpip/transport/udp", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 60df51dae..86c634715 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -26,6 +26,7 @@ package netstack import ( "bytes" + "fmt" "io" "math" "reflect" @@ -33,11 +34,14 @@ import ( "syscall" "time" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -154,6 +158,9 @@ var Metrics = tcpip.Stats{ OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."), MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."), MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."), + IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."), + IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."), + IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), @@ -190,6 +197,7 @@ var Metrics = tcpip.Stats{ MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), + ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), }, } @@ -230,7 +238,7 @@ type commonEndpoint interface { // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. - SetSockOpt(interface{}) *tcpip.Error + SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and // transport.Endpoint.SetSockOptBool. @@ -242,7 +250,7 @@ type commonEndpoint interface { // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. - GetSockOpt(interface{}) *tcpip.Error + GetSockOpt(tcpip.GettableSocketOption) *tcpip.Error // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and // transport.Endpoint.GetSockOpt. @@ -251,6 +259,9 @@ type commonEndpoint interface { // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) + + // LastError implements tcpip.Endpoint.LastError. + LastError() *tcpip.Error } // LINT.IfChange @@ -294,8 +305,9 @@ type socketOpsCommon struct { readView buffer.View // readCM holds control message information for the last packet read // from Endpoint. - readCM tcpip.ControlMessages - sender tcpip.FullAddress + readCM tcpip.ControlMessages + sender tcpip.FullAddress + linkPacketInfo tcpip.LinkPacketInfo // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps // of returned messages can be returned via control messages. When @@ -323,7 +335,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue } dirent := socket.NewDirent(t, netstackDevice) - defer dirent.DecRef() + defer dirent.DecRef(t) return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ socketOpsCommon: socketOpsCommon{ Queue: queue, @@ -416,7 +428,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } - // TODO(b/129292371): Return protocol too. + // TODO(gvisor.dev/issue/173): Return protocol too. return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), @@ -444,8 +456,21 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } s.readView = nil s.sender = tcpip.FullAddress{} + s.linkPacketInfo = tcpip.LinkPacketInfo{} + + var v buffer.View + var cms tcpip.ControlMessages + var err *tcpip.Error - v, cms, err := s.Endpoint.Read(&s.sender) + switch e := s.Endpoint.(type) { + // The ordering of these interfaces matters. The most specific + // interfaces must be specified before the more generic Endpoint + // interface. + case tcpip.PacketEndpoint: + v, cms, err = e.ReadPacket(&s.sender, &s.linkPacketInfo) + case tcpip.Endpoint: + v, cms, err = e.Read(&s.sender) + } if err != nil { atomic.StoreUint32(&s.readViewHasData, 0) return syserr.TranslateNetstackError(err) @@ -459,8 +484,35 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(ctx context.Context) { + e, ch := waiter.NewChannelEntry(nil) + s.EventRegister(&e, waiter.EventHUp|waiter.EventErr) + defer s.EventUnregister(&e) + s.Endpoint.Close() + + // SO_LINGER option is valid only for TCP. For other socket types + // return after endpoint close. + if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) { + return + } + + var v tcpip.LingerOption + if err := s.Endpoint.GetSockOpt(&v); err != nil { + return + } + + // The case for zero timeout is handled in tcp endpoint close function. + // Close is blocked until either: + // 1. The endpoint state is not in any of the states: FIN-WAIT1, + // CLOSING and LAST_ACK. + // 2. Timeout is reached. + if v.Enabled && v.Timeout != 0 { + t := kernel.TaskFromContext(ctx) + start := t.Kernel().MonotonicClock().Now() + deadline := start.Add(v.Timeout) + t.BlockWithDeadline(ch, true, deadline) + } } // Read implements fs.FileOperations.Read. @@ -535,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { } v := buffer.NewView(size) if _, err := i.src.CopyIn(i.ctx, v); err != nil { + // EOF can be returned only if src is a file and this means it + // is in a splice syscall and the error has to be ignored. + if err == io.EOF { + return v, nil + } return nil, tcpip.ErrBadAddress } return v, nil @@ -719,6 +776,14 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool defer s.EventUnregister(&e) if err := s.Endpoint.Connect(addr); err != tcpip.ErrConnectStarted && err != tcpip.ErrAlreadyConnecting { + if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { + // TCP unlike UDP returns EADDRNOTAVAIL when it can't + // find an available local ephemeral port. + if err == tcpip.ErrNoPortAvailable { + return syserr.ErrAddressNotAvailable + } + } + return syserr.TranslateNetstackError(err) } @@ -775,7 +840,20 @@ func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Issue the bind request to the endpoint. - return syserr.TranslateNetstackError(s.Endpoint.Bind(addr)) + err := s.Endpoint.Bind(addr) + if err == tcpip.ErrNoPortAvailable { + // Bind always returns EADDRINUSE irrespective of if the specified port was + // already bound or if an ephemeral port was requested but none were + // available. + // + // tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because + // UDP connect returns EAGAIN on ephemeral port exhaustion. + // + // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. + err = tcpip.ErrPortInUse + } + + return syserr.TranslateNetstackError(err) } // Listen implements the linux syscall listen(2) for sockets backed by @@ -786,7 +864,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { +func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -795,7 +873,7 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite // Try to accept the connection again; if it fails, then wait until we // get a notification. for { - if ep, wq, err := s.Endpoint.Accept(); err != tcpip.ErrWouldBlock { + if ep, wq, err := s.Endpoint.Accept(peerAddr); err != tcpip.ErrWouldBlock { return ep, wq, syserr.TranslateNetstackError(err) } @@ -808,15 +886,18 @@ func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waite // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, wq, terr := s.Endpoint.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if terr != tcpip.ErrWouldBlock || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error - ep, wq, err = s.blockingAccept(t) + ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -826,7 +907,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -836,13 +917,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer and write it to peer slice. - var err *syserr.Error - addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -884,7 +960,7 @@ func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket @@ -894,68 +970,33 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } - return val, nil + return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } - return val, nil + return &val, nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_GET_INFO: - if outLen < linux.SizeOfIPTGetinfo { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) - if err != nil { - return nil, err - } - return info, nil - - case linux.IPT_SO_GET_ENTRIES: - if outLen < linux.SizeOfIPTGetEntries { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) - if err != nil { - return nil, err - } - return entries, nil - - } - } - - return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -964,10 +1005,10 @@ func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family in return getSockOptTCP(t, ep, name, outLen) case linux.SOL_IPV6: - return getSockOptIPv6(t, ep, name, outLen) + return getSockOptIPv6(t, s, ep, name, outPtr, outLen) case linux.SOL_IP: - return getSockOptIP(t, ep, name, outLen, family) + return getSockOptIP(t, s, ep, name, outPtr, outLen, family) case linux.SOL_UDP, linux.SOL_ICMPV6, @@ -988,7 +1029,7 @@ func boolToInt32(v bool) int32 { } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -997,11 +1038,14 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // Get the last error and convert it. - err := ep.GetSockOpt(tcpip.ErrorOption{}) + err := ep.LastError() if err == nil { - return int32(0), nil + optP := primitive.Int32(0) + return &optP, nil } - return int32(syserr.TranslateNetstackError(err).ToLinux().Number()), nil + + optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux().Number()) + return &optP, nil case linux.SO_PEERCRED: if family != linux.AF_UNIX || outLen < syscall.SizeofUcred { @@ -1009,11 +1053,12 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } tcred := t.Credentials() - return syscall.Ucred{ - Pid: int32(t.ThreadGroup().ID()), - Uid: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), - Gid: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), - }, nil + creds := linux.ControlMessageCredentials{ + PID: int32(t.ThreadGroup().ID()), + UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), + GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), + } + return &creds, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { @@ -1024,7 +1069,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { @@ -1040,7 +1087,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam size = math.MaxInt32 } - return int32(size), nil + sizeP := primitive.Int32(size) + return &sizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { @@ -1056,7 +1104,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam size = math.MaxInt32 } - return int32(size), nil + sizeP := primitive.Int32(size) + return &sizeP, nil case linux.SO_REUSEADDR: if outLen < sizeOfInt32 { @@ -1067,7 +1116,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { @@ -1078,7 +1128,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_BINDTODEVICE: var v tcpip.BindToDeviceOption @@ -1086,7 +1138,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.TranslateNetstackError(err) } if v == 0 { - return []byte{}, nil + var b primitive.ByteSlice + return &b, nil } if outLen < linux.IFNAMSIZ { return nil, syserr.ErrInvalidArgument @@ -1101,7 +1154,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam // interface was removed. return nil, syserr.ErrUnknownDevice } - return append([]byte(nic.Name), 0), nil + + name := primitive.ByteSlice(append([]byte(nic.Name), 0)) + return &name, nil case linux.SO_BROADCAST: if outLen < sizeOfInt32 { @@ -1112,7 +1167,9 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { @@ -1123,13 +1180,26 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } - return linux.Linger{}, nil + + var v tcpip.LingerOption + var linger linux.Linger + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + if v.Enabled { + linger.OnOff = 1 + } + linger.Linger = int32(v.Timeout.Seconds()) + return &linger, nil case linux.SO_SNDTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. @@ -1137,7 +1207,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - return linux.NsecToTimeval(s.SendTimeout()), nil + sendTimeout := linux.NsecToTimeval(s.SendTimeout()) + return &sendTimeout, nil case linux.SO_RCVTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. @@ -1145,7 +1216,8 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.ErrInvalidArgument } - return linux.NsecToTimeval(s.RecvTimeout()), nil + recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) + return &recvTimeout, nil case linux.SO_OOBINLINE: if outLen < sizeOfInt32 { @@ -1157,7 +1229,32 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam return nil, syserr.TranslateNetstackError(err) } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil + + case linux.SO_NO_CHECK: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.NoChecksumOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil + + case linux.SO_ACCEPTCONN: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptBool(tcpip.AcceptConnOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) @@ -1166,7 +1263,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam } // getSockOptTCP implements GetSockOpt when level is SOL_TCP. -func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.TCP_NODELAY: if outLen < sizeOfInt32 { @@ -1177,7 +1274,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(!v), nil + + vP := primitive.Int32(boolToInt32(!v)) + return &vP, nil case linux.TCP_CORK: if outLen < sizeOfInt32 { @@ -1188,7 +1287,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { @@ -1199,7 +1300,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { @@ -1210,8 +1313,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { @@ -1222,8 +1325,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Second), nil + keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) + return &keepAliveIdle, nil case linux.TCP_KEEPINTVL: if outLen < sizeOfInt32 { @@ -1234,8 +1337,20 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } + keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) + return &keepAliveInterval, nil - return int32(time.Duration(v) / time.Second), nil + case linux.TCP_KEEPCNT: + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + + v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_USER_TIMEOUT: if outLen < sizeOfInt32 { @@ -1246,8 +1361,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Millisecond), nil + tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) + return &tcpUserTimeout, nil case linux.TCP_INFO: var v tcpip.TCPInfoOption @@ -1260,12 +1375,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa info := linux.TCPInfo{} // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, &info) - if len(ib) > outLen { - ib = ib[:outLen] + buf := t.CopyScratchBuffer(info.SizeBytes()) + info.MarshalUnsafe(buf) + if len(buf) > outLen { + buf = buf[:outLen] } - - return ib, nil + bufP := primitive.ByteSlice(buf) + return &bufP, nil case linux.TCP_CC_INFO, linux.TCP_NOTSENT_LOWAT, @@ -1295,7 +1411,9 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa } b := make([]byte, toCopy) copy(b, v) - return b, nil + + bP := primitive.ByteSlice(b) + return &bP, nil case linux.TCP_LINGER2: if outLen < sizeOfInt32 { @@ -1306,8 +1424,13 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(time.Duration(v) / time.Second), nil + var lingerTimeout primitive.Int32 + if v >= 0 { + lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) + } else { + lingerTimeout = -1 + } + return &lingerTimeout, nil case linux.TCP_DEFER_ACCEPT: if outLen < sizeOfInt32 { @@ -1319,7 +1442,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa return nil, syserr.TranslateNetstackError(err) } - return int32(time.Duration(v) / time.Second), nil + tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) + return &tcpDeferAccept, nil case linux.TCP_SYNCNT: if outLen < sizeOfInt32 { @@ -1330,8 +1454,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.TCP_WINDOW_CLAMP: if outLen < sizeOfInt32 { @@ -1342,8 +1466,8 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa if err != nil { return nil, syserr.TranslateNetstackError(err) } - - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil default: emitUnimplementedEventTCP(t, name) } @@ -1351,7 +1475,7 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa } // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. -func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { @@ -1362,7 +1486,9 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) @@ -1370,21 +1496,24 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf case linux.IPV6_TCLASS: // Length handling for parity with Linux. if outLen == 0 { - return make([]byte, 0), nil + var b primitive.ByteSlice + return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } - uintv := uint32(v) + uintv := primitive.Uint32(v) // Linux truncates the output binary to outLen. - ib := binary.Marshal(nil, usermem.ByteOrder, &uintv) + ib := t.CopyScratchBuffer(uintv.SizeBytes()) + uintv.MarshalUnsafe(ib) // Handle cases where outLen is lesser than sizeOfInt32. if len(ib) > outLen { ib = ib[:outLen] } - return ib, nil + ibP := primitive.ByteSlice(ib) + return &ibP, nil case linux.IPV6_RECVTCLASS: if outLen < sizeOfInt32 { @@ -1395,7 +1524,82 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil + + case linux.IP6T_ORIGINAL_DST: + if outLen < int(binary.Size(linux.SockAddrInet6{})) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OriginalDestinationOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) + return a.(*linux.SockAddrInet6), nil + + case linux.IP6T_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true) + if err != nil { + return nil, err + } + return &info, nil + + case linux.IP6T_SO_GET_ENTRIES: + // IPTGetEntries is reused for IPv6. + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return &entries, nil + + case linux.IP6T_SO_GET_REVISION_TARGET: + if outLen < linux.SizeOfXTGetRevision { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) + if err != nil { + return nil, err + } + return &ret, nil default: emitUnimplementedEventIPv6(t, name) @@ -1404,7 +1608,7 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf } // getSockOptIP implements GetSockOpt when level is SOL_IP. -func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family int) (interface{}, *syserr.Error) { +func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr usermem.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { @@ -1417,11 +1621,12 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in } // Fill in the default value, if needed. - if v == 0 { - v = DefaultTTL + vP := primitive.Int32(v) + if vP == 0 { + vP = DefaultTTL } - return int32(v), nil + return &vP, nil case linux.IP_MULTICAST_TTL: if outLen < sizeOfInt32 { @@ -1433,7 +1638,8 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in return nil, syserr.TranslateNetstackError(err) } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.IP_MULTICAST_IF: if outLen < len(linux.InetAddr{}) { @@ -1447,7 +1653,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) - return a.(*linux.SockAddrInet).Addr, nil + return &a.(*linux.SockAddrInet).Addr, nil case linux.IP_MULTICAST_LOOP: if outLen < sizeOfInt32 { @@ -1458,21 +1664,26 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IP_TOS: // Length handling for parity with Linux. if outLen == 0 { - return []byte(nil), nil + var b primitive.ByteSlice + return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } if outLen < sizeOfInt32 { - return uint8(v), nil + vP := primitive.Uint8(v) + return &vP, nil } - return int32(v), nil + vP := primitive.Int32(v) + return &vP, nil case linux.IP_RECVTOS: if outLen < sizeOfInt32 { @@ -1483,7 +1694,9 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil case linux.IP_PKTINFO: if outLen < sizeOfInt32 { @@ -1494,7 +1707,82 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in if err != nil { return nil, syserr.TranslateNetstackError(err) } - return boolToInt32(v), nil + + vP := primitive.Int32(boolToInt32(v)) + return &vP, nil + + case linux.SO_ORIGINAL_DST: + if outLen < int(binary.Size(linux.SockAddrInet{})) { + return nil, syserr.ErrInvalidArgument + } + + var v tcpip.OriginalDestinationOption + if err := ep.GetSockOpt(&v); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) + return a.(*linux.SockAddrInet), nil + + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false) + if err != nil { + return nil, err + } + return &info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return &entries, nil + + case linux.IPT_SO_GET_REVISION_TARGET: + if outLen < linux.SizeOfXTGetRevision { + return nil, syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return nil, syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) + if err != nil { + return nil, err + } + return &ret, nil default: emitUnimplementedEventIP(t, name) @@ -1529,26 +1817,6 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa return nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_SET_REPLACE: - if len(optVal) < linux.SizeOfIPTReplace { - return syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return syserr.ErrNoDevice - } - // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal) - - case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. - return nil - } - } - return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } @@ -1563,21 +1831,26 @@ func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int return setSockOptTCP(t, ep, name, optVal) case linux.SOL_IPV6: - return setSockOptIPv6(t, ep, name, optVal) + return setSockOptIPv6(t, s, ep, name, optVal) case linux.SOL_IP: - return setSockOptIP(t, ep, name, optVal) + return setSockOptIP(t, s, ep, name, optVal) + + case linux.SOL_PACKET: + // gVisor doesn't support any SOL_PACKET options just return not + // supported. Returning nil here will result in tcpdump thinking AF_PACKET + // features are supported and proceed to use them and break. + t.Kernel().EmitUnimplementedEvent(t) + return syserr.ErrProtocolNotAvailable case linux.SOL_UDP, linux.SOL_ICMPV6, - linux.SOL_RAW, - linux.SOL_PACKET: + linux.SOL_RAW: t.Kernel().EmitUnimplementedEvent(t) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. @@ -1622,7 +1895,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } name := string(optVal[:n]) if name == "" { - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0))) + v := tcpip.BindToDeviceOption(0) + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) } s := t.NetworkContext() if s == nil { @@ -1630,7 +1904,8 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } for nicID, nic := range s.Interfaces() { if nic.Name == name { - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID))) + v := tcpip.BindToDeviceOption(nicID) + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) } } return syserr.ErrUnknownDevice @@ -1696,7 +1971,16 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.OutOfBandInlineOption(v))) + opt := tcpip.OutOfBandInlineOption(v) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) + + case linux.SO_NO_CHECK: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.NoChecksumOption, v != 0)) case linux.SO_LINGER: if len(optVal) < linux.SizeOfLinger { @@ -1710,14 +1994,21 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam socket.SetSockOptEmitUnimplementedEvent(t, name) } - return nil + return syserr.TranslateNetstackError( + ep.SetSockOpt(&tcpip.LingerOption{ + Enabled: v.OnOff != 0, + Timeout: time.Second * time.Duration(v.Linger)})) + + case linux.SO_DETACH_FILTER: + // optval is ignored. + var v tcpip.SocketDetachFilterOption + return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) default: socket.SetSockOptEmitUnimplementedEvent(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptTCP implements SetSockOpt when level is SOL_TCP. @@ -1764,7 +2055,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 1 || v > linux.MAX_TCP_KEEPIDLE { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)))) + opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPINTVL: if len(optVal) < sizeOfInt32 { @@ -1775,7 +2067,19 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 1 || v > linux.MAX_TCP_KEEPINTVL { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)))) + opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) + + case linux.TCP_KEEPCNT: + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + + v := usermem.ByteOrder.Uint32(optVal) + if v < 1 || v > linux.MAX_TCP_KEEPCNT { + return syserr.ErrInvalidArgument + } + return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) case linux.TCP_USER_TIMEOUT: if len(optVal) < sizeOfInt32 { @@ -1786,11 +2090,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 0 { return syserr.ErrInvalidArgument } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)))) + opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) - if err := ep.SetSockOpt(v); err != nil { + if err := ep.SetSockOpt(&v); err != nil { return syserr.TranslateNetstackError(err) } return nil @@ -1800,8 +2105,9 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * return syserr.ErrInvalidArgument } - v := usermem.ByteOrder.Uint32(optVal) - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)))) + v := int32(usermem.ByteOrder.Uint32(optVal)) + opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_DEFER_ACCEPT: if len(optVal) < sizeOfInt32 { @@ -1811,7 +2117,8 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * if v < 0 { v = 0 } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)))) + opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) + return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_SYNCNT: if len(optVal) < sizeOfInt32 { @@ -1836,12 +2143,11 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) * emitUnimplementedEventTCP(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. -func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.IPV6_V6ONLY: if len(optVal) < sizeOfInt32 { @@ -1890,12 +2196,32 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTClassOption, v != 0)) + case linux.IP6T_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIP6TReplace { + return syserr.ErrInvalidArgument + } + + // Only valid for raw IPv6 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET6 || skType != linux.SOCK_RAW { + return syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true) + + case linux.IP6T_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + default: emitUnimplementedEventIPv6(t, name) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } var ( @@ -1950,7 +2276,7 @@ func parseIntOrChar(buf []byte) (int32, *syserr.Error) { } // setSockOptIP implements SetSockOpt when level is SOL_IP. -func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.IP_MULTICAST_TTL: v, err := parseIntOrChar(optVal) @@ -1973,7 +2299,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.AddMembershipOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change AddMembership to use the standard // any address representation. @@ -1987,7 +2313,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.RemoveMembershipOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change DropMembership to use the standard // any address representation. @@ -2001,7 +2327,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s return err } - return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.MulticastInterfaceOption{ + return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ NIC: tcpip.NICID(req.InterfaceIndex), InterfaceAddr: bytesToIPAddress(req.InterfaceAddr[:]), })) @@ -2060,13 +2386,43 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s } return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, v != 0)) + case linux.IP_HDRINCL: + if len(optVal) == 0 { + return nil + } + v, err := parseIntOrChar(optVal) + if err != nil { + return err + } + return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.IPHdrIncludedOption, v != 0)) + + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + // Only valid for raw IPv4 sockets. + if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { + return syserr.ErrProtocolNotAvailable + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, linux.IP_CHECKSUM, linux.IP_DROP_SOURCE_MEMBERSHIP, linux.IP_FREEBIND, - linux.IP_HDRINCL, linux.IP_IPSEC_POLICY, linux.IP_MINTTL, linux.IP_MSFILTER, @@ -2095,8 +2451,7 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s t.Kernel().EmitUnimplementedEvent(t) } - // Default to the old behavior; hand off to network stack. - return syserr.TranslateNetstackError(ep.SetSockOpt(struct{}{})) + return nil } // emitUnimplementedEventTCP emits unimplemented event if name is valid. This @@ -2106,30 +2461,20 @@ func emitUnimplementedEventTCP(t *kernel.Task, name int) { switch name { case linux.TCP_CONGESTION, linux.TCP_CORK, - linux.TCP_DEFER_ACCEPT, linux.TCP_FASTOPEN, linux.TCP_FASTOPEN_CONNECT, linux.TCP_FASTOPEN_KEY, linux.TCP_FASTOPEN_NO_COOKIE, - linux.TCP_KEEPCNT, - linux.TCP_KEEPIDLE, - linux.TCP_KEEPINTVL, - linux.TCP_LINGER2, - linux.TCP_MAXSEG, linux.TCP_QUEUE_SEQ, - linux.TCP_QUICKACK, linux.TCP_REPAIR, linux.TCP_REPAIR_QUEUE, linux.TCP_REPAIR_WINDOW, linux.TCP_SAVED_SYN, linux.TCP_SAVE_SYN, - linux.TCP_SYNCNT, linux.TCP_THIN_DUPACK, linux.TCP_THIN_LINEAR_TIMEOUTS, linux.TCP_TIMESTAMP, - linux.TCP_ULP, - linux.TCP_USER_TIMEOUT, - linux.TCP_WINDOW_CLAMP: + linux.TCP_ULP: t.Kernel().EmitUnimplementedEvent(t) } @@ -2291,7 +2636,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) return &out, uint32(sockAddrInet6Size) case linux.AF_PACKET: - // TODO(b/129292371): Return protocol too. + // TODO(gvisor.dev/issue/173): Return protocol too. var out linux.SockAddrLink out.Family = linux.AF_PACKET out.InterfaceIndex = int32(addr.NIC) @@ -2397,6 +2742,23 @@ func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { cmsg.IP.Inq = int32(len(s.readView) + rcvBufUsed) } +func toLinuxPacketType(pktType tcpip.PacketType) uint8 { + switch pktType { + case tcpip.PacketHost: + return linux.PACKET_HOST + case tcpip.PacketOtherHost: + return linux.PACKET_OTHERHOST + case tcpip.PacketOutgoing: + return linux.PACKET_OUTGOING + case tcpip.PacketBroadcast: + return linux.PACKET_BROADCAST + case tcpip.PacketMulticast: + return linux.PACKET_MULTICAST + default: + panic(fmt.Sprintf("unknown packet type: %d", pktType)) + } +} + // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. @@ -2452,6 +2814,11 @@ func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSeq var addrLen uint32 if isPacket && senderRequested { addr, addrLen = ConvertAddress(s.family, s.sender) + switch v := addr.(type) { + case *linux.SockAddrLink: + v.Protocol = htons(uint16(s.linkPacketInfo.Protocol)) + v.PacketType = toLinuxPacketType(s.linkPacketInfo.PktType) + } } if peek { @@ -2686,11 +3053,16 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, } func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("ioctl(2) may only be called from a task goroutine") + } + // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. switch args[1].Int() { - case syscall.SIOCGSTAMP: + case linux.SIOCGSTAMP: s.readMu.Lock() defer s.readMu.Unlock() if !s.timestampValid { @@ -2698,9 +3070,7 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy } tv := linux.NsecToTimeval(s.timestampNS) - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tv, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: @@ -2719,9 +3089,8 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err } @@ -2730,52 +3099,49 @@ func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.Sy // Ioctl performs a socket ioctl. func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("ioctl(2) may only be called from a task goroutine") + } + switch arg := int(args[1].Int()); arg { - case syscall.SIOCGIFFLAGS, - syscall.SIOCGIFADDR, - syscall.SIOCGIFBRDADDR, - syscall.SIOCGIFDSTADDR, - syscall.SIOCGIFHWADDR, - syscall.SIOCGIFINDEX, - syscall.SIOCGIFMAP, - syscall.SIOCGIFMETRIC, - syscall.SIOCGIFMTU, - syscall.SIOCGIFNAME, - syscall.SIOCGIFNETMASK, - syscall.SIOCGIFTXQLEN: + case linux.SIOCGIFFLAGS, + linux.SIOCGIFADDR, + linux.SIOCGIFBRDADDR, + linux.SIOCGIFDSTADDR, + linux.SIOCGIFHWADDR, + linux.SIOCGIFINDEX, + linux.SIOCGIFMAP, + linux.SIOCGIFMETRIC, + linux.SIOCGIFMTU, + linux.SIOCGIFNAME, + linux.SIOCGIFNETMASK, + linux.SIOCGIFTXQLEN, + linux.SIOCETHTOOL: var ifr linux.IFReq - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { return 0, err.ToError() } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }) + _, err := ifr.CopyOut(t, args[2].Pointer()) return 0, err - case syscall.SIOCGIFCONF: + case linux.SIOCGIFCONF: // Return a list of interface addresses or the buffer size // necessary to hold the list. var ifc linux.IFConf - if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } - if err := ifconfIoctl(ctx, io, &ifc); err != nil { + if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { return 0, err } - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ifc, usermem.IOOpts{ - AddressSpaceActive: true, - }) - + _, err := ifc.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: @@ -2788,9 +3154,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc v = math.MaxInt32 } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCOUTQ: @@ -2804,9 +3169,8 @@ func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.Sysc } // Copy result to userspace. - _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(v), usermem.IOOpts{ - AddressSpaceActive: true, - }) + vP := primitive.Int32(v) + _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: @@ -2832,7 +3196,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to // identify a device. - if arg == syscall.SIOCGIFNAME { + if arg == linux.SIOCGIFNAME { // Gets the name of the interface given the interface index // stored in ifr_ifindex. index = int32(usermem.ByteOrder.Uint32(ifr.Data[:4])) @@ -2855,21 +3219,28 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } switch arg { - case syscall.SIOCGIFINDEX: + case linux.SIOCGIFINDEX: // Copy out the index to the data. usermem.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) - case syscall.SIOCGIFHWADDR: + case linux.SIOCGIFHWADDR: // Copy the hardware address out. - ifr.Data[0] = 6 // IEEE802.2 arp type. - ifr.Data[1] = 0 + // + // Refer: https://linux.die.net/man/7/netdevice + // SIOCGIFHWADDR, SIOCSIFHWADDR + // + // Get or set the hardware address of a device using + // ifr_hwaddr. The hardware address is specified in a struct + // sockaddr. sa_family contains the ARPHRD_* device type, + // sa_data the L2 hardware address starting from byte 0. Setting + // the hardware address is a privileged operation. + usermem.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) n := copy(ifr.Data[2:], iface.Addr) for i := 2 + n; i < len(ifr.Data); i++ { ifr.Data[i] = 0 // Clear padding. } - usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(n)) - case syscall.SIOCGIFFLAGS: + case linux.SIOCGIFFLAGS: f, err := interfaceStatusFlags(stack, iface.Name) if err != nil { return err @@ -2878,7 +3249,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe // matches Linux behavior. usermem.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) - case syscall.SIOCGIFADDR: + case linux.SIOCGIFADDR: // Copy the IPv4 address out. for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. @@ -2889,32 +3260,32 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe break } - case syscall.SIOCGIFMETRIC: + case linux.SIOCGIFMETRIC: // Gets the metric of the device. As per netdevice(7), this // always just sets ifr_metric to 0. usermem.ByteOrder.PutUint32(ifr.Data[:4], 0) - case syscall.SIOCGIFMTU: + case linux.SIOCGIFMTU: // Gets the MTU of the device. usermem.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) - case syscall.SIOCGIFMAP: + case linux.SIOCGIFMAP: // Gets the hardware parameters of the device. // TODO(gvisor.dev/issue/505): Implement. - case syscall.SIOCGIFTXQLEN: + case linux.SIOCGIFTXQLEN: // Gets the transmit queue length of the device. // TODO(gvisor.dev/issue/505): Implement. - case syscall.SIOCGIFDSTADDR: + case linux.SIOCGIFDSTADDR: // Gets the destination address of a point-to-point device. // TODO(gvisor.dev/issue/505): Implement. - case syscall.SIOCGIFBRDADDR: + case linux.SIOCGIFBRDADDR: // Gets the broadcast address of a device. // TODO(gvisor.dev/issue/505): Implement. - case syscall.SIOCGIFNETMASK: + case linux.SIOCGIFNETMASK: // Gets the network mask of a device. for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. @@ -2931,6 +3302,14 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe break } + case linux.SIOCETHTOOL: + // Stubbed out for now, Ideally we should implement the required + // sub-commands for ETHTOOL + // + // See: + // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c + return syserr.ErrEndpointOperation + default: // Not a valid call. return syserr.ErrInvalidArgument @@ -2940,7 +3319,7 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. -func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { +func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. @@ -2977,9 +3356,7 @@ func ifconfIoctl(ctx context.Context, io usermem.IO, ifc *linux.IFConf) error { // Copy the ifr to userspace. dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) ifc.Len += int32(linux.SizeOfIFReq) - if _, err := usermem.CopyObjectOut(ctx, io, usermem.Addr(dst), ifr, usermem.IOOpts{ - AddressSpaceActive: true, - }); err != nil { + if _, err := ifr.CopyOut(t, usermem.Addr(dst)); err != nil { return err } } diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index fcd8013c0..b0d9e4d9e 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -18,12 +18,13 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" - "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" @@ -34,10 +35,13 @@ import ( // SocketVFS2 encapsulates all the state needed to represent a network stack // endpoint in the kernel context. +// +// +stateify savable type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -53,7 +57,8 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu } mnt := t.Kernel().SocketMount() - d := sockfs.NewDentry(t.Credentials(), mnt) + d := sockfs.NewDentry(t, mnt) + defer d.DecRef(t) s := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ @@ -64,6 +69,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu protocol: protocol, }, } + s.LockFD.Init(&vfs.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, @@ -75,6 +81,13 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu return vfsfd, nil } +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.socketOpsCommon.Release(ctx) +} + // Readiness implements waiter.Waitable.Readiness. func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) @@ -147,14 +160,18 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs // tcpip.Endpoint. func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Issue the accept request to get the new endpoint. - ep, wq, terr := s.Endpoint.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if terr != tcpip.ErrWouldBlock || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error - ep, wq, err = s.blockingAccept(t) + ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -164,7 +181,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, 0, syserr.FromError(err) @@ -172,13 +189,9 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addr linux.SockAddr var addrLen uint32 - if peerRequested { + if peerAddr != nil { // Get address of the peer and write it to peer slice. - var err *syserr.Error - addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + addr, addrLen = ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ @@ -197,7 +210,7 @@ func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketVFS2 rather than // commonEndpoint. commonEndpoint should be extended to support socket @@ -207,63 +220,28 @@ func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem. if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } - return val, nil + return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } - val := int32(0) + val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } - return val, nil + return &val, nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_GET_INFO: - if outLen < linux.SizeOfIPTGetinfo { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) - if err != nil { - return nil, err - } - return info, nil - - case linux.IPT_SO_GET_ENTRIES: - if outLen < linux.SizeOfIPTGetEntries { - return nil, syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return nil, syserr.ErrNoDevice - } - entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) - if err != nil { - return nil, err - } - return entries, nil - - } - } - - return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by @@ -293,25 +271,15 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return nil } - if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { - switch name { - case linux.IPT_SO_SET_REPLACE: - if len(optVal) < linux.SizeOfIPTReplace { - return syserr.ErrInvalidArgument - } - - stack := inet.StackFromContext(t) - if stack == nil { - return syserr.ErrNoDevice - } - // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal) - - case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. - return nil - } - } - return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index f5fa18136..fa9ac9059 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -15,10 +15,11 @@ package netstack import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -41,19 +42,29 @@ func (s *Stack) SupportsIPv6() bool { return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber) } +// Converts Netstack's ARPHardwareType to equivalent linux constants. +func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 { + switch t { + case header.ARPHardwareNone: + return linux.ARPHRD_NONE + case header.ARPHardwareLoopback: + return linux.ARPHRD_LOOPBACK + case header.ARPHardwareEther: + return linux.ARPHRD_ETHER + default: + panic(fmt.Sprintf("unknown ARPHRD type: %d", t)) + } +} + // Interfaces implements inet.Stack.Interfaces. func (s *Stack) Interfaces() map[int32]inet.Interface { is := make(map[int32]inet.Interface) for id, ni := range s.Stack.NICInfo() { - var devType uint16 - if ni.Flags.Loopback { - devType = linux.ARPHRD_LOOPBACK - } is[int32(id)] = inet.Interface{ Name: ni.Name, Addr: []byte(ni.LinkAddress), Flags: uint32(nicStateFlagsToLinux(ni.Flags)), - DeviceType: devType, + DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType), MTU: ni.MTU, } } @@ -89,62 +100,107 @@ func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { return nicAddrs } -// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. -func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { +// convertAddr converts an InterfaceAddr to a ProtocolAddress. +func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) { var ( - protocol tcpip.NetworkProtocolNumber - address tcpip.Address + protocol tcpip.NetworkProtocolNumber + address tcpip.Address + protocolAddress tcpip.ProtocolAddress ) switch addr.Family { case linux.AF_INET: - if len(addr.Addr) < header.IPv4AddressSize { - return syserror.EINVAL + if len(addr.Addr) != header.IPv4AddressSize { + return protocolAddress, syserror.EINVAL } if addr.PrefixLen > header.IPv4AddressSize*8 { - return syserror.EINVAL + return protocolAddress, syserror.EINVAL } protocol = ipv4.ProtocolNumber - address = tcpip.Address(addr.Addr[:header.IPv4AddressSize]) - + address = tcpip.Address(addr.Addr) case linux.AF_INET6: - if len(addr.Addr) < header.IPv6AddressSize { - return syserror.EINVAL + if len(addr.Addr) != header.IPv6AddressSize { + return protocolAddress, syserror.EINVAL } if addr.PrefixLen > header.IPv6AddressSize*8 { - return syserror.EINVAL + return protocolAddress, syserror.EINVAL } protocol = ipv6.ProtocolNumber - address = tcpip.Address(addr.Addr[:header.IPv6AddressSize]) - + address = tcpip.Address(addr.Addr) default: - return syserror.ENOTSUP + return protocolAddress, syserror.ENOTSUP } - protocolAddress := tcpip.ProtocolAddress{ + protocolAddress = tcpip.ProtocolAddress{ Protocol: protocol, AddressWithPrefix: tcpip.AddressWithPrefix{ Address: address, PrefixLen: int(addr.PrefixLen), }, } + return protocolAddress, nil +} + +// AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. +func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + protocolAddress, err := convertAddr(addr) + if err != nil { + return err + } // Attach address to interface. - if err := s.Stack.AddProtocolAddressWithOptions(tcpip.NICID(idx), protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + nicID := tcpip.NICID(idx) + if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil { + return syserr.TranslateNetstackError(err).ToError() + } + + // Add route for local network if it doesn't exist already. + localRoute := tcpip.Route{ + Destination: protocolAddress.AddressWithPrefix.Subnet(), + Gateway: "", // No gateway for local network. + NIC: nicID, + } + + for _, rt := range s.Stack.GetRouteTable() { + if rt.Equal(localRoute) { + return nil + } + } + + // Local route does not exist yet. Add it. + s.Stack.AddRoute(localRoute) + + return nil +} + +// RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. +func (s *Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { + protocolAddress, err := convertAddr(addr) + if err != nil { + return err + } + + // Remove addresses matching the address and prefix. + nicID := tcpip.NICID(idx) + if err := s.Stack.RemoveAddress(nicID, protocolAddress.AddressWithPrefix.Address); err != nil { return syserr.TranslateNetstackError(err).ToError() } - // Add route for local network. - s.Stack.AddRoute(tcpip.Route{ + // Remove the corresponding local network route if it exists. + localRoute := tcpip.Route{ Destination: protocolAddress.AddressWithPrefix.Subnet(), Gateway: "", // No gateway for local network. - NIC: tcpip.NICID(idx), + NIC: nicID, + } + s.Stack.RemoveRoutes(func(rt tcpip.Route) bool { + return rt.Equal(localRoute) }) + return nil } // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { - var rs tcp.ReceiveBufferSizeOption + var rs tcpip.TCPReceiveBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs) return inet.TCPBufferSize{ Min: rs.Min, @@ -155,17 +211,17 @@ func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { - rs := tcp.ReceiveBufferSizeOption{ + rs := tcpip.TCPReceiveBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, rs)).ToError() + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError() } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { - var ss tcp.SendBufferSizeOption + var ss tcpip.TCPSendBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss) return inet.TCPBufferSize{ Min: ss.Min, @@ -176,24 +232,40 @@ func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { - ss := tcp.SendBufferSizeOption{ + ss := tcpip.TCPSendBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, ss)).ToError() + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError() } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { - var sack tcp.SACKEnabled + var sack tcpip.TCPSACKEnabled err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack) return bool(sack), syserr.TranslateNetstackError(err).ToError() } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(enabled bool) error { - return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() + opt := tcpip.TCPSACKEnabled(enabled) + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() +} + +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + var recovery tcpip.TCPRecovery + if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + return inet.TCPLossRecovery(recovery), nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { + opt := tcpip.TCPRecovery(recovery) + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // Statistics implements inet.Stack.Statistics. @@ -314,7 +386,7 @@ func (s *Stack) Statistics(stat interface{}, arg string) error { udp.PacketsSent.Value(), // OutDatagrams. udp.ReceiveBufferErrors.Value(), // RcvbufErrors. 0, // Udp/SndbufErrors. - 0, // Udp/InCsumErrors. + udp.ChecksumErrors.Value(), // Udp/InCsumErrors. 0, // Udp/IgnoredMulti. } default: @@ -362,16 +434,10 @@ func (s *Stack) RouteTable() []inet.Route { } // IPTables returns the stack's iptables. -func (s *Stack) IPTables() (stack.IPTables, error) { +func (s *Stack) IPTables() (*stack.IPTables, error) { return s.Stack.IPTables(), nil } -// FillDefaultIPTables sets the stack's iptables to the default tables, which -// allow and do not modify all traffic. -func (s *Stack) FillDefaultIPTables() { - netfilter.FillDefaultIPTables(s.Stack) -} - // Resume implements inet.Stack.Resume. func (s *Stack) Resume() { s.Stack.Resume() @@ -391,3 +457,24 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { s.Stack.RestoreCleanupEndpoints(es) } + +// Forwarding implements inet.Stack.Forwarding. +func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { + switch protocol { + case ipv4.ProtocolNumber, ipv6.ProtocolNumber: + return s.Stack.Forwarding(protocol) + default: + panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol)) + } +} + +// SetForwarding implements inet.Stack.SetForwarding. +func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { + switch protocol { + case ipv4.ProtocolNumber, ipv6.ProtocolNumber: + s.Stack.SetForwarding(protocol, enable) + default: + panic(fmt.Sprintf("SetForwarding(%v) failed: unsupported protocol", protocol)) + } + return nil +} diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 6580bd6e9..fd31479e5 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -45,8 +46,8 @@ type ControlMessages struct { } // Release releases Unix domain socket credentials and rights. -func (c *ControlMessages) Release() { - c.Unix.Release() +func (c *ControlMessages) Release(ctx context.Context) { + c.Unix.Release(ctx) } // Socket is an interface combining fs.FileOperations and SocketOps, @@ -86,7 +87,7 @@ type SocketOps interface { Shutdown(t *kernel.Task, how int) *syserr.Error // GetSockOpt implements the getsockopt(2) linux syscall. - GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) + GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) // SetSockOpt implements the setsockopt(2) linux syscall. SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error @@ -407,7 +408,6 @@ func emitUnimplementedEvent(t *kernel.Task, name int) { linux.SO_MARK, linux.SO_MAX_PACING_RATE, linux.SO_NOFCS, - linux.SO_NO_CHECK, linux.SO_OOBINLINE, linux.SO_PASSCRED, linux.SO_PASSSEC, diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index de2cc4bdf..cce0acc33 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,12 +1,37 @@ load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +go_template_instance( + name = "socket_refs", + out = "socket_refs.go", + package = "unix", + prefix = "socketOperations", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "SocketOperations", + }, +) + +go_template_instance( + name = "socket_vfs2_refs", + out = "socket_vfs2_refs.go", + package = "unix", + prefix = "socketVFS2", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "SocketVFS2", + }, +) + go_library( name = "unix", srcs = [ "device.go", "io.go", + "socket_refs.go", + "socket_vfs2_refs.go", "unix.go", "unix_vfs2.go", ], @@ -15,12 +40,16 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", + "//pkg/marshal", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index c708b6030..3ebbd28b0 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "queue_refs", + out = "queue_refs.go", + package = "transport", + prefix = "queue", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "queue", + }, +) + go_library( name = "transport", srcs = [ @@ -22,6 +33,7 @@ go_library( "connectioned_state.go", "connectionless.go", "queue.go", + "queue_refs.go", "transport_message_list.go", "unix.go", ], @@ -32,6 +44,7 @@ go_library( "//pkg/ilist", "//pkg/log", "//pkg/refs", + "//pkg/refsvfs2", "//pkg/sync", "//pkg/syserr", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index ce5b94ee7..aa4f3c04d 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} - q1.EnableLeakCheck("transport.queue") + q1.EnableLeakCheck() q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - q2.EnableLeakCheck("transport.queue") + q2.EnableLeakCheck() if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool { // The socket will be a fresh state after a call to close and may be reused. // That is, close may be used to "unbind" or "disconnect" the socket in error // paths. -func (e *connectionedEndpoint) Close() { +func (e *connectionedEndpoint) Close(ctx context.Context) { e.Lock() var c ConnectedEndpoint var r Receiver @@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() { case e.Listening(): close(e.acceptedChan) for n := range e.acceptedChan { - n.Close() + n.Close(ctx) } e.acceptedChan = nil e.path = "" @@ -241,18 +241,18 @@ func (e *connectionedEndpoint) Close() { e.Unlock() if c != nil { c.CloseNotify() - c.Release() + c.Release(ctx) } if r != nil { r.CloseNotify() - r.Release() + r.Release(ctx) } } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { - return syserr.ErrConnectionRefused + return syserr.ErrWrongProtocolForSocket } // Check if ce is e to avoid a deadlock. @@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} - readQueue.EnableLeakCheck("transport.queue") + readQueue.EnableLeakCheck() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - writeQueue.EnableLeakCheck("transport.queue") + writeQueue.EnableLeakCheck() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { @@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn return nil default: // Busy; return ECONNREFUSED per spec. - ne.Close() + ne.Close(ctx) e.Unlock() ce.Unlock() return syserr.ErrConnectionRefused @@ -391,7 +391,7 @@ func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error { } // Accept accepts a new connection. -func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { +func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) { e.Lock() defer e.Unlock() @@ -401,6 +401,18 @@ func (e *connectionedEndpoint) Accept() (Endpoint, *syserr.Error) { select { case ne := <-e.acceptedChan: + if peerAddr != nil { + ne.Lock() + c := ne.connected + ne.Unlock() + if c != nil { + addr, err := c.GetLocalAddress() + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + *peerAddr = addr + } + } return ne, nil default: @@ -476,6 +488,9 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask // State implements socket.Socket.State. func (e *connectionedEndpoint) State() uint32 { + e.Lock() + defer e.Unlock() + if e.Connected() { return linux.SS_CONNECTED } diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 4b06d63ac..f8aacca13 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -42,7 +42,7 @@ var ( func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} - q.EnableLeakCheck("transport.queue") + q.EnableLeakCheck() ep.receiver = &queueReceiver{readQueue: &q} return ep } @@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. -func (e *connectionlessEndpoint) Close() { +func (e *connectionlessEndpoint) Close(ctx context.Context) { e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) e.connected = nil } @@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() { e.Unlock() r.CloseNotify() - r.Release() + r.Release(ctx) } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C if err != nil { return 0, syserr.ErrInvalidEndpointState } - defer connected.Release() + defer connected.Release(ctx) e.Lock() - n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) } e.connected = connected e.Unlock() @@ -144,12 +144,12 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi } // Listen starts listening on the connection. -func (e *connectionlessEndpoint) Listen(int) *syserr.Error { +func (*connectionlessEndpoint) Listen(int) *syserr.Error { return syserr.ErrNotSupported } // Accept accepts a new connection. -func (e *connectionlessEndpoint) Accept() (Endpoint, *syserr.Error) { +func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) { return nil, syserr.ErrNotSupported } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index d8f3ad63d..342def28f 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,7 +15,7 @@ package transport import ( - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -27,7 +27,7 @@ import ( // // +stateify savable type queue struct { - refs.AtomicRefCount + queueRefs ReaderQueue *waiter.Queue WriterQueue *waiter.Queue @@ -57,21 +57,23 @@ func (q *queue) Close() { // Both the read and write queues must be notified after resetting: // q.ReaderQueue.Notify(waiter.EventIn) // q.WriterQueue.Notify(waiter.EventOut) -func (q *queue) Reset() { +func (q *queue) Reset(ctx context.Context) { q.mu.Lock() for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.Release() + cur.Release(ctx) } q.dataList.Reset() q.used = 0 q.mu.Unlock() } -// DecRef implements RefCounter.DecRef with destructor q.Reset. -func (q *queue) DecRef() { - q.DecRefWithDestructor(q.Reset) - // We don't need to notify after resetting because no one cares about - // this queue after all references have been dropped. +// DecRef implements RefCounter.DecRef. +func (q *queue) DecRef(ctx context.Context) { + q.queueRefs.DecRef(func() { + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. + q.Reset(ctx) + }) } // IsReadable determines if q is currently readable. @@ -111,7 +113,7 @@ func (q *queue) IsWritable() bool { // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { +func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { @@ -124,7 +126,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress } if discardEmpty && l == 0 { q.mu.Unlock() - c.Release() + c.Release(ctx) return 0, false, nil } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2f1b127df..b648273a4 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -32,12 +32,14 @@ import ( const initialLimit = 16 * 1024 // A RightsControlMessage is a control message containing FDs. +// +// +stateify savable type RightsControlMessage interface { // Clone returns a copy of the RightsControlMessage. Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. - Release() + Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. @@ -74,9 +76,9 @@ func (c *ControlMessages) Clone() ControlMessages { } // Release releases both the credentials and the rights. -func (c *ControlMessages) Release() { +func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { - c.Rights.Release() + c.Rights.Release(ctx) } *c = ControlMessages{} } @@ -90,7 +92,7 @@ type Endpoint interface { // Close puts the endpoint in a closed state and frees all resources // associated with it. - Close() + Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. @@ -151,7 +153,10 @@ type Endpoint interface { // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *syserr.Error) + // + // peerAddr if not nil will be populated with the address of the connected + // peer on a successful accept. + Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. @@ -172,9 +177,8 @@ type Endpoint interface { // connected. GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) - // SetSockOpt sets a socket option. opt should be one of the tcpip.*Option - // types. - SetSockOpt(opt interface{}) *tcpip.Error + // SetSockOpt sets a socket option. + SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error // SetSockOptBool sets a socket option for simple cases when a value has // the int type. @@ -184,9 +188,8 @@ type Endpoint interface { // the int type. SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error - // GetSockOpt gets a socket option. opt should be a pointer to one of the - // tcpip.*Option types. - GetSockOpt(opt interface{}) *tcpip.Error + // GetSockOpt gets a socket option. + GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error // GetSockOptBool gets a socket option for simple cases when a return // value has the int type. @@ -199,6 +202,9 @@ type Endpoint interface { // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 + + // LastError implements tcpip.Endpoint.LastError. + LastError() *tcpip.Error } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket @@ -252,7 +258,7 @@ type BoundEndpoint interface { // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. - Release() + Release(ctx context.Context) } // message represents a message passed over a Unix domain socket. @@ -281,8 +287,8 @@ func (m *message) Length() int64 { } // Release releases any resources held by the message. -func (m *message) Release() { - m.Control.Release() +func (m *message) Release(ctx context.Context) { + m.Control.Release(ctx) } // Peek returns a copy of the message. @@ -304,7 +310,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) + Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -332,8 +338,8 @@ type Receiver interface { RecvMaxQueueSize() int64 // Release releases any resources owned by the Receiver. It should be - // called before droping all references to a Receiver. - Release() + // called before dropping all references to a Receiver. + Release(ctx context.Context) } // queueReceiver implements Receiver for datagram sockets. @@ -344,7 +350,7 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. -func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error @@ -398,8 +404,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { } // Release implements Receiver.Release. -func (q *queueReceiver) Release() { - q.readQueue.DecRef() +func (q *queueReceiver) Release(ctx context.Context) { + q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. @@ -456,7 +462,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -483,7 +489,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, c := q.control.Clone() // Don't consume data since we are peeking. - copied, data, _ = vecCopy(data, q.buffer) + copied, _, _ = vecCopy(data, q.buffer) return copied, copied, c, false, q.addr, notify, nil } @@ -502,7 +508,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, var cmTruncated bool if c.Rights != nil && numRights == 0 { - c.Rights.Release() + c.Rights.Release(ctx) c.Rights = nil cmTruncated = true } @@ -557,7 +563,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, // Consume rights. if numRights == 0 { cmTruncated = true - q.control.Rights.Release() + q.control.Rights.Release(ctx) } else { c.Rights = q.control.Rights haveRights = true @@ -568,6 +574,12 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, return copied, copied, c, cmTruncated, q.addr, notify, nil } +// Release implements Receiver.Release. +func (q *streamQueueReceiver) Release(ctx context.Context) { + q.queueReceiver.Release(ctx) + q.control.Release(ctx) +} + // A ConnectedEndpoint is an Endpoint that can be used to send Messages. type ConnectedEndpoint interface { // Passcred implements Endpoint.Passcred. @@ -582,7 +594,7 @@ type ConnectedEndpoint interface { // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. - Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) + Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. @@ -615,8 +627,8 @@ type ConnectedEndpoint interface { SendMaxQueueSize() int64 // Release releases any resources owned by the ConnectedEndpoint. It should - // be called before droping all references to a ConnectedEndpoint. - Release() + // be called before dropping all references to a ConnectedEndpoint. + Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. @@ -654,7 +666,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) } // Send implements ConnectedEndpoint.Send. -func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { @@ -669,7 +681,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu truncate = true } - return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate) + return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. @@ -707,8 +719,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 { } // Release implements ConnectedEndpoint.Release. -func (e *connectedEndpoint) Release() { - e.writeQueue.DecRef() +func (e *connectedEndpoint) Release(ctx context.Context) { + e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. @@ -742,6 +754,9 @@ type baseEndpoint struct { // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string + + // linger is used for SO_LINGER socket option. + linger tcpip.LingerOption } // EventRegister implements waiter.Waitable.EventRegister. @@ -798,7 +813,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } - recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek) e.Unlock() if err != nil { return 0, 0, ControlMessages{}, false, err @@ -827,7 +842,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return 0, syserr.ErrAlreadyConnected } - n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -837,8 +852,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return n, err } -// SetSockOpt sets a socket option. Currently not supported. -func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error { +// SetSockOpt sets a socket option. +func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error { + switch v := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + e.linger = *v + e.Unlock() + } return nil } @@ -866,7 +887,7 @@ func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { switch opt { - case tcpip.KeepaliveEnabledOption: + case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption: return false, nil case tcpip.PasscredOption: @@ -940,9 +961,12 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. -func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { - switch opt.(type) { - case tcpip.ErrorOption: +func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error { + switch o := opt.(type) { + case *tcpip.LingerOption: + e.Lock() + *o = e.linger + e.Unlock() return nil default: @@ -951,6 +975,11 @@ func (e *baseEndpoint) GetSockOpt(opt interface{}) *tcpip.Error { } } +// LastError implements Endpoint.LastError. +func (*baseEndpoint) LastError() *tcpip.Error { + return nil +} + // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { @@ -1001,6 +1030,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { } // Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() { +func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 5b29e9d7f..adad485a9 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -55,13 +55,14 @@ type SocketOperations struct { fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + socketOperationsRefs socketOpsCommon } // New creates a new unix socket. func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true}) } @@ -79,34 +80,41 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty stype: stype, }, } - s.EnableLeakCheck("unix.SocketOperations") - + s.EnableLeakCheck() return fs.NewFile(ctx, d, flags, &s) } +// DecRef implements RefCounter.DecRef. +func (s *SocketOperations) DecRef(ctx context.Context) { + s.socketOperationsRefs.DecRef(func() { + s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } + }) +} + +// Release implemements fs.FileOperations.Release. +func (s *SocketOperations) Release(ctx context.Context) { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef(ctx) +} + // socketOpsCommon contains the socket operations common to VFS1 and VFS2. // // +stateify savable type socketOpsCommon struct { - refs.AtomicRefCount socket.SendReceiveTimeout ep transport.Endpoint stype linux.SockType -} -// DecRef implements RefCounter.DecRef. -func (s *socketOpsCommon) DecRef() { - s.DecRefWithDestructor(func() { - s.ep.Close() - }) -} - -// Release implemements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { - // Release only decrements a reference on s because s may be referenced in - // the abstract socket namespace. - s.DecRef() + // abstractName and abstractNamespace indicate the name and namespace of the + // socket if it is bound to an abstract socket namespace. Once the socket is + // bound, they cannot be modified. + abstractName string + abstractNamespace *kernel.AbstractSocketNamespace } func (s *socketOpsCommon) isPacket() bool { @@ -184,8 +192,8 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { - return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen) } // Listen implements the linux syscall listen(2) for sockets backed by @@ -196,7 +204,7 @@ func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { +func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -205,7 +213,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Try to accept the connection; if it fails, then wait until we get a // notification. for { - if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock { return ep, err } @@ -218,22 +226,25 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, * // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, err := s.ep.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error - ep, err = s.blockingAccept(t) + ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns := New(t, ep, s.stype) - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -243,13 +254,8 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer. - var err *syserr.Error - addr, addrLen, err = ns.FileOperations.(*SocketOperations).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ @@ -283,17 +289,21 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { // The parent and name. var d *fs.Dirent var name string cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) // Is there no slash at all? if !strings.Contains(p, "/") { @@ -301,7 +311,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { name = p } else { root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Find the last path component, we know that something follows // that final slash, otherwise extractPath() would have failed. lastSlash := strings.LastIndex(p, "/") @@ -317,7 +327,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // No path available. return syserr.ErrNoSuchFile } - defer d.DecRef() + defer d.DecRef(t) name = p[lastSlash+1:] } @@ -331,7 +341,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if err != nil { return syserr.ErrPortInUse } - childDir.DecRef() + childDir.DecRef(t) } return nil @@ -377,9 +387,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) - root.DecRef() + root.DecRef(t) if relPath { - start.DecRef() + start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) @@ -392,15 +402,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, cwd := t.FSContext().WorkingDirectory() remainingTraversals := uint(fs.DefaultTraversalLimit) d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) - cwd.DecRef() - root.DecRef() + cwd.DecRef(t) + root.DecRef(t) if e != nil { return nil, syserr.FromError(e) } // Extract the endpoint if one is there. ep := d.Inode.BoundEndpoint(path) - d.DecRef() + d.DecRef(t) if ep == nil { // No socket! return nil, syserr.ErrConnectionRefused @@ -414,10 +424,21 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if err != nil { return err } - defer ep.Release() + defer ep.Release(t) // Connect the server endpoint. - return s.ep.Connect(t, ep) + err = s.ep.Connect(t, ep) + + if err == syserr.ErrWrongProtocolForSocket { + // Linux for abstract sockets returns ErrConnectionRefused + // instead of ErrWrongProtocolForSocket. + path, _ := extractPath(sockaddr) + if len(path) > 0 && path[0] == 0 { + err = syserr.ErrConnectionRefused + } + } + + return err } // Write implements fs.FileOperations.Write. @@ -448,15 +469,25 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b To: nil, } if len(to) > 0 { - ep, err := extractEndpoint(t, to) - if err != nil { - return 0, err - } - defer ep.Release() - w.To = ep + switch s.stype { + case linux.SOCK_SEQPACKET: + to = nil + case linux.SOCK_STREAM: + if s.State() == linux.SS_CONNECTED { + return 0, syserr.ErrAlreadyConnected + } + return 0, syserr.ErrNotSupported + default: + ep, err := extractEndpoint(t, to) + if err != nil { + return 0, err + } + defer ep.Release(t) + w.To = ep - if ep.Passcred() && w.Control.Credentials == nil { - w.Control.Credentials = control.MakeCreds(t) + if ep.Passcred() && w.Control.Credentials == nil { + w.Control.Credentials = control.MakeCreds(t) + } } } @@ -541,13 +572,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 45e109361..7a78444dc 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -18,7 +18,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -35,11 +37,15 @@ import ( // SocketVFS2 implements socket.SocketVFS2 (and by extension, // vfs.FileDescriptionImpl) for Unix sockets. +// +// +stateify savable type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD + socketVFS2Refs socketOpsCommon } @@ -49,9 +55,10 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - d := sockfs.NewDentry(t.Credentials(), mnt) + d := sockfs.NewDentry(t, mnt) + defer d.DecRef(t) - fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{}) if err != nil { return nil, syserr.FromError(err) } @@ -60,7 +67,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. -func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) { +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { @@ -73,6 +80,8 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 stype: stype, }, } + sock.EnableLeakCheck() + sock.LockFD.Init(locks) vfsfd := &sock.vfsfd if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, @@ -84,15 +93,34 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 return vfsfd, nil } +// DecRef implements RefCounter.DecRef. +func (s *SocketVFS2) DecRef(ctx context.Context) { + s.socketVFS2Refs.DecRef(func() { + t := kernel.TaskFromContext(ctx) + t.Kernel().DeleteSocketVFS2(&s.vfsfd) + s.ep.Close(ctx) + if s.abstractNamespace != nil { + s.abstractNamespace.Remove(s.abstractName, s) + } + }) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (s *SocketVFS2) Release(ctx context.Context) { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef(ctx) +} + // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // a transport.Endpoint. -func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { - return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { +func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.socketOpsCommon.EventRegister(&e, waiter.EventIn) @@ -101,7 +129,7 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr // Try to accept the connection; if it fails, then wait until we get a // notification. for { - if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock { return ep, err } @@ -114,15 +142,18 @@ func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr // Accept implements the linux syscall accept(2) for sockets backed by // a transport.Endpoint. func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { - // Issue the accept request to get the new endpoint. - ep, err := s.ep.Accept() + var peerAddr *tcpip.FullAddress + if peerRequested { + peerAddr = &tcpip.FullAddress{} + } + ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error - ep, err = s.blockingAccept(t) + ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } @@ -132,7 +163,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) @@ -140,13 +171,8 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block var addr linux.SockAddr var addrLen uint32 - if peerRequested { - // Get address of the peer. - var err *syserr.Error - addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) - if err != nil { - return 0, nil, 0, err - } + if peerAddr != nil { + addr, addrLen = netstack.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ @@ -179,19 +205,23 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + asn := t.AbstractSockets() + name := p[1:] + if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } + s.abstractName = name + s.abstractNamespace = asn } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, @@ -238,13 +268,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // PWrite implements vfs.FileDescriptionImpl. @@ -297,6 +331,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + // providerVFS2 is a unix domain socket provider for VFS2. type providerVFS2 struct{} @@ -319,7 +363,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) f, err := NewSockfsFile(t, ep, stype) if err != nil { - ep.Close() + ep.Close(t) return nil, err } return f, nil @@ -343,14 +387,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (* ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { - ep1.Close() - ep2.Close() + ep1.Close(t) + ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { - s1.DecRef() - ep2.Close() + s1.DecRef(t) + ep2.Close(t) return nil, nil, err } diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 0ea4aab8b..563d60578 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -12,10 +12,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/time", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/state/statefile", "//pkg/syserror", diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go index 9eb626b76..167754537 100644 --- a/pkg/sentry/state/state.go +++ b/pkg/sentry/state/state.go @@ -19,10 +19,12 @@ import ( "fmt" "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/state/statefile" "gvisor.dev/gvisor/pkg/syserror" @@ -57,11 +59,14 @@ type SaveOpts struct { } // Save saves the system state. -func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { +func (opts SaveOpts) Save(ctx context.Context, k *kernel.Kernel, w *watchdog.Watchdog) error { log.Infof("Sandbox save started, pausing all tasks.") k.Pause() - defer k.Unpause() - defer log.Infof("Tasks resumed after save.") + k.ReceiveTaskStates() + defer func() { + k.Unpause() + log.Infof("Tasks resumed after save.") + }() w.Stop() defer w.Start() @@ -78,7 +83,7 @@ func (opts SaveOpts) Save(k *kernel.Kernel, w *watchdog.Watchdog) error { err = ErrStateFile{err} } else { // Save the kernel. - err = k.SaveTo(wc) + err = k.SaveTo(ctx, wc) // ENOSPC is a state file error. This error can only come from // writing the state file, and not from fs.FileOperations.Fsync @@ -105,7 +110,7 @@ type LoadOpts struct { } // Load loads the given kernel, setting the provided platform and stack. -func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) error { +func (opts LoadOpts) Load(ctx context.Context, k *kernel.Kernel, n inet.Stack, clocks time.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { // Open the file. r, m, err := statefile.NewReader(opts.Source, opts.Key) if err != nil { @@ -115,5 +120,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) er previousMetadata = m // Restore the Kernel object graph. - return k.LoadFrom(r, n, clocks) + return k.LoadFrom(ctx, r, n, clocks, vfsOpts) } diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 88d5db9fc..a920180d3 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/binary", "//pkg/bits", "//pkg/eventchannel", + "//pkg/marshal/primitive", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/kernel", diff --git a/pkg/sentry/strace/epoll.go b/pkg/sentry/strace/epoll.go index a6e48b836..ae3b998c8 100644 --- a/pkg/sentry/strace/epoll.go +++ b/pkg/sentry/strace/epoll.go @@ -26,7 +26,7 @@ import ( func epollEvent(t *kernel.Task, eventAddr usermem.Addr) string { var e linux.EpollEvent - if _, err := t.CopyIn(eventAddr, &e); err != nil { + if _, err := e.CopyIn(t, eventAddr); err != nil { return fmt.Sprintf("%#x {error reading event: %v}", eventAddr, err) } var sb strings.Builder @@ -41,7 +41,7 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui addr := eventsAddr for i := uint64(0); i < numEvents; i++ { var e linux.EpollEvent - if _, err := t.CopyIn(addr, &e); err != nil { + if _, err := e.CopyIn(t, addr); err != nil { fmt.Fprintf(&sb, "{error reading event at %#x: %v}", addr, err) continue } @@ -50,10 +50,10 @@ func epollEvents(t *kernel.Task, eventsAddr usermem.Addr, numEvents, maxBytes ui sb.WriteString("...") break } - if _, ok := addr.AddLength(uint64(linux.SizeOfEpollEvent)); !ok { - fmt.Fprintf(&sb, "{error reading event at %#x: EFAULT}", addr) - continue - } + // Allowing addr to overflow is consistent with Linux, and harmless; if + // this isn't the last iteration of the loop, the next call to CopyIn + // will just fail with EFAULT. + addr, _ = addr.AddLength(uint64(linux.SizeOfEpollEvent)) } sb.WriteString("}") return sb.String() @@ -75,7 +75,7 @@ var epollEventEvents = abi.FlagSet{ {Flag: linux.EPOLLPRI, Name: "EPOLLPRI"}, {Flag: linux.EPOLLOUT, Name: "EPOLLOUT"}, {Flag: linux.EPOLLERR, Name: "EPOLLERR"}, - {Flag: linux.EPOLLHUP, Name: "EPULLHUP"}, + {Flag: linux.EPOLLHUP, Name: "EPOLLHUP"}, {Flag: linux.EPOLLRDNORM, Name: "EPOLLRDNORM"}, {Flag: linux.EPOLLRDBAND, Name: "EPOLLRDBAND"}, {Flag: linux.EPOLLWRNORM, Name: "EPOLLWRNORM"}, diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index c0512de89..cc5f70cd4 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" @@ -166,7 +167,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) } buf := make([]byte, length) - if _, err := t.CopyIn(addr, &buf); err != nil { + if _, err := t.CopyInBytes(addr, buf); err != nil { return fmt.Sprintf("%#x (error decoding control: %v)", addr, err) } @@ -302,7 +303,7 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64) func msghdr(t *kernel.Task, addr usermem.Addr, printContent bool, maxBytes uint64) string { var msg slinux.MessageHeader64 - if err := slinux.CopyInMessageHeader64(t, addr, &msg); err != nil { + if _, err := msg.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding msghdr: %v)", addr, err) } s := fmt.Sprintf( @@ -380,9 +381,9 @@ func postSockAddr(t *kernel.Task, addr usermem.Addr, lengthPtr usermem.Addr) str func copySockLen(t *kernel.Task, addr usermem.Addr) (uint32, error) { // socklen_t is 32-bits. - var l uint32 - _, err := t.CopyIn(addr, &l) - return l, err + var l primitive.Uint32 + _, err := l.CopyIn(t, addr) + return uint32(l), err } func sockLenPointer(t *kernel.Task, addr usermem.Addr) string { @@ -436,22 +437,22 @@ func getSockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, o func sockOptVal(t *kernel.Task, level, optname uint64, optVal usermem.Addr, optLen uint64, maximumBlobSize uint) string { switch optLen { case 1: - var v uint8 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint8 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 2: - var v uint16 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint16 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } return fmt.Sprintf("%#x {value=%v}", optVal, v) case 4: - var v uint32 - _, err := t.CopyIn(optVal, &v) + var v primitive.Uint32 + _, err := v.CopyIn(t, optVal) if err != nil { return fmt.Sprintf("%#x {error reading optval: %v}", optVal, err) } @@ -521,6 +522,7 @@ var sockOptNames = map[uint64]abi.ValueSet{ linux.IP_ROUTER_ALERT: "IP_ROUTER_ALERT", linux.IP_PKTOPTIONS: "IP_PKTOPTIONS", linux.IP_MTU: "IP_MTU", + linux.SO_ORIGINAL_DST: "SO_ORIGINAL_DST", }, linux.SOL_SOCKET: { linux.SO_ERROR: "SO_ERROR", @@ -631,6 +633,8 @@ var sockOptNames = map[uint64]abi.ValueSet{ linux.IPV6_UNICAST_IF: "IPV6_UNICAST_IF", linux.MCAST_MSFILTER: "MCAST_MSFILTER", linux.IPV6_ADDRFORM: "IPV6_ADDRFORM", + linux.IP6T_SO_GET_INFO: "IP6T_SO_GET_INFO", + linux.IP6T_SO_GET_ENTRIES: "IP6T_SO_GET_ENTRIES", }, linux.SOL_NETLINK: { linux.NETLINK_BROADCAST_ERROR: "NETLINK_BROADCAST_ERROR", diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 68ca537c8..396744597 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -17,17 +17,16 @@ package strace import ( - "encoding/binary" "fmt" "strconv" "strings" - "syscall" "time" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -91,7 +90,7 @@ func iovecs(t *kernel.Task, addr usermem.Addr, iovcnt int, printContent bool, ma } b := make([]byte, size) - amt, err := t.CopyIn(ar.Start, b) + amt, err := t.CopyInBytes(ar.Start, b) if err != nil { iovs[i] = fmt.Sprintf("{base=%#x, len=%d, %q..., error decoding string: %v}", ar.Start, ar.Length(), b[:amt], err) continue @@ -118,7 +117,7 @@ func dump(t *kernel.Task, addr usermem.Addr, size uint, maximumBlobSize uint) st } b := make([]byte, size) - amt, err := t.CopyIn(addr, b) + amt, err := t.CopyInBytes(addr, b) if err != nil { return fmt.Sprintf("%#x (error decoding string: %s)", addr, err) } @@ -147,14 +146,14 @@ func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectory() var name string if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) name, _ = wd.FullName(root) } else { name = "(unknown cwd)" @@ -167,7 +166,7 @@ func fd(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := file.Dirent.FullName(root) return fmt.Sprintf("%#x %s", fd, name) @@ -175,12 +174,12 @@ func fd(t *kernel.Task, fd int32) string { func fdVFS2(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, wd) return fmt.Sprintf("AT_FDCWD %s", name) @@ -191,7 +190,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) return fmt.Sprintf("%#x %s", fd, name) @@ -199,7 +198,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 - _, err := t.CopyIn(addr, &fds) + _, err := primitive.CopyInt32SliceIn(t, addr, fds[:]) if err != nil { return fmt.Sprintf("%#x (error decoding fds: %s)", addr, err) } @@ -209,7 +208,7 @@ func fdpair(t *kernel.Task, addr usermem.Addr) string { func uname(t *kernel.Task, addr usermem.Addr) string { var u linux.UtsName - if _, err := t.CopyIn(addr, &u); err != nil { + if _, err := u.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utsname: %s)", addr, err) } @@ -222,7 +221,7 @@ func utimensTimespec(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timespec - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } @@ -244,7 +243,7 @@ func timespec(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timespec - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timespec: %s)", addr, err) } return fmt.Sprintf("%#x {sec=%v nsec=%v}", addr, tim.Sec, tim.Nsec) @@ -256,7 +255,7 @@ func timeval(t *kernel.Task, addr usermem.Addr) string { } var tim linux.Timeval - if _, err := t.CopyIn(addr, &tim); err != nil { + if _, err := tim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding timeval: %s)", addr, err) } @@ -268,8 +267,8 @@ func utimbuf(t *kernel.Task, addr usermem.Addr) string { return "null" } - var utim syscall.Utimbuf - if _, err := t.CopyIn(addr, &utim); err != nil { + var utim linux.Utime + if _, err := utim.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding utimbuf: %s)", addr, err) } @@ -282,7 +281,7 @@ func stat(t *kernel.Task, addr usermem.Addr) string { } var stat linux.Stat - if _, err := t.CopyIn(addr, &stat); err != nil { + if _, err := stat.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding stat: %s)", addr, err) } return fmt.Sprintf("%#x {dev=%d, ino=%d, mode=%s, nlink=%d, uid=%d, gid=%d, rdev=%d, size=%d, blksize=%d, blocks=%d, atime=%s, mtime=%s, ctime=%s}", addr, stat.Dev, stat.Ino, linux.FileMode(stat.Mode), stat.Nlink, stat.UID, stat.GID, stat.Rdev, stat.Size, stat.Blksize, stat.Blocks, time.Unix(stat.ATime.Sec, stat.ATime.Nsec), time.Unix(stat.MTime.Sec, stat.MTime.Nsec), time.Unix(stat.CTime.Sec, stat.CTime.Nsec)) @@ -294,7 +293,7 @@ func itimerval(t *kernel.Task, addr usermem.Addr) string { } interval := timeval(t, addr) - value := timeval(t, addr+usermem.Addr(binary.Size(linux.Timeval{}))) + value := timeval(t, addr+usermem.Addr((*linux.Timeval)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } @@ -304,7 +303,7 @@ func itimerspec(t *kernel.Task, addr usermem.Addr) string { } interval := timespec(t, addr) - value := timespec(t, addr+usermem.Addr(binary.Size(linux.Timespec{}))) + value := timespec(t, addr+usermem.Addr((*linux.Timespec)(nil).SizeBytes())) return fmt.Sprintf("%#x {interval=%s, value=%s}", addr, interval, value) } @@ -330,7 +329,7 @@ func rusage(t *kernel.Task, addr usermem.Addr) string { } var ru linux.Rusage - if _, err := t.CopyIn(addr, &ru); err != nil { + if _, err := ru.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding rusage: %s)", addr, err) } return fmt.Sprintf("%#x %+v", addr, ru) @@ -342,7 +341,7 @@ func capHeader(t *kernel.Task, addr usermem.Addr) string { } var hdr linux.CapUserHeader - if _, err := t.CopyIn(addr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error decoding header: %s)", addr, err) } @@ -367,7 +366,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { } var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return fmt.Sprintf("%#x (error decoding header: %v)", dataAddr, err) } @@ -376,7 +375,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { switch hdr.Version { case linux.LINUX_CAPABILITY_VERSION_1: var data linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := data.CopyIn(t, dataAddr); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data.Permitted) @@ -384,7 +383,7 @@ func capData(t *kernel.Task, hdrAddr, dataAddr usermem.Addr) string { e = uint64(data.Effective) case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: var data [2]linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return fmt.Sprintf("%#x (error decoding data: %v)", dataAddr, err) } p = uint64(data[0].Permitted) | (uint64(data[1].Permitted) << 32) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index d9fb808c0..d23a0068a 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -28,7 +28,7 @@ import ( // CreateEpoll implements the epoll_create(2) linux syscall. func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, @@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // Try to remove the entry. - return e.RemoveEntry(epoll.FileIdentifier{file, fd}) + return e.RemoveEntry(t, epoll.FileIdentifier{file, fd}) } // WaitEpoll implements the epoll_wait(2) linux syscall. @@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve if epollfile == nil { return nil, syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 217fcfef2..a2e441448 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -21,6 +21,7 @@ go_library( "sys_identity.go", "sys_inotify.go", "sys_lseek.go", + "sys_membarrier.go", "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", @@ -56,6 +57,7 @@ go_library( "sys_xattr.go", "timespec.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ "//pkg/abi", @@ -64,6 +66,8 @@ go_library( "//pkg/bpf", "//pkg/context", "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/metric", "//pkg/rand", "//pkg/safemem", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 64de56ac5..dab6207c0 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -36,8 +36,8 @@ var ( // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. -func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op string, f *vfs.FileDescription) error { - known, err := handleIOErrorImpl(t, partialResult, err, intr, op) +func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *vfs.FileDescription) error { + known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op) if err != nil { return err } @@ -46,7 +46,7 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s fs := f.Mount().Filesystem().VirtualFilesystem() root := vfs.RootFromContext(t) name, _ := fs.PathnameWithDeleted(t, root, f.VirtualDentry()) - log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, err, err, op, name) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, ioerr, ioerr, op, name) partialResultOnce.Do(partialResultMetric.Increment) } return nil @@ -56,15 +56,15 @@ func HandleIOErrorVFS2(t *kernel.Task, partialResult bool, err, intr error, op s // errors, we may consume the error and return only the partial read/write. // // op and f are used only for panics. -func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op string, f *fs.File) error { - known, err := handleIOErrorImpl(t, partialResult, err, intr, op) +func handleIOError(t *kernel.Task, partialResult bool, ioerr, intr error, op string, f *fs.File) error { + known, err := handleIOErrorImpl(t, partialResult, ioerr, intr, op) if err != nil { return err } if !known { // An unknown error is encountered with a partial read/write. name, _ := f.Dirent.FullName(nil /* ignore chroot */) - log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, err, err, op, name, f.FileOperations) + log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, ioerr, ioerr, op, name, f.FileOperations) partialResultOnce.Do(partialResultMetric.Increment) } return nil @@ -147,7 +147,7 @@ func handleIOErrorImpl(t *kernel.Task, partialResult bool, err, intr error, op s } switch err.(type) { - case kernel.SyscallRestartErrno: + case syserror.SyscallRestartErrno: // Identical to the EINTR case. return true, nil } diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index ea4f9b1a7..36902d177 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{ 63: syscalls.Supported("uname", Uname), 64: syscalls.Supported("semget", Semget), 65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), - 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), 67: syscalls.Supported("shmdt", Shmdt), 68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) @@ -138,7 +138,7 @@ var AMD64 = &kernel.SyscallTable{ 83: syscalls.Supported("mkdir", Mkdir), 84: syscalls.Supported("rmdir", Rmdir), 85: syscalls.Supported("creat", Creat), - 86: syscalls.Supported("link", Link), + 86: syscalls.PartiallySupported("link", Link, "Limited support with Gofer. Link count and linked files may get out of sync because gVisor is not aware of external hardlinks.", nil), 87: syscalls.Supported("unlink", Unlink), 88: syscalls.Supported("symlink", Symlink), 89: syscalls.Supported("readlink", Readlink), @@ -305,9 +305,9 @@ var AMD64 = &kernel.SyscallTable{ 250: syscalls.Error("keyctl", syserror.EACCES, "Not available to user.", nil), 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) - 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil), - 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 253: syscalls.PartiallySupported("inotify_init", InotifyInit, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 254: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 255: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil), 257: syscalls.Supported("openat", Openat), 258: syscalls.Supported("mkdirat", Mkdirat), @@ -317,7 +317,7 @@ var AMD64 = &kernel.SyscallTable{ 262: syscalls.Supported("fstatat", Fstatat), 263: syscalls.Supported("unlinkat", Unlinkat), 264: syscalls.Supported("renameat", Renameat), - 265: syscalls.Supported("linkat", Linkat), + 265: syscalls.PartiallySupported("linkat", Linkat, "See link(2).", nil), 266: syscalls.Supported("symlinkat", Symlinkat), 267: syscalls.Supported("readlinkat", Readlinkat), 268: syscalls.Supported("fchmodat", Fchmodat), @@ -325,8 +325,8 @@ var AMD64 = &kernel.SyscallTable{ 270: syscalls.Supported("pselect", Pselect), 271: syscalls.Supported("ppoll", Ppoll), 272: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), - 273: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 274: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 273: syscalls.Supported("set_robust_list", SetRobustList), + 274: syscalls.Supported("get_robust_list", GetRobustList), 275: syscalls.Supported("splice", Splice), 276: syscalls.Supported("tee", Tee), 277: syscalls.PartiallySupported("sync_file_range", SyncFileRange, "Full data flush is not guaranteed at this time.", nil), @@ -346,7 +346,7 @@ var AMD64 = &kernel.SyscallTable{ 291: syscalls.Supported("epoll_create1", EpollCreate1), 292: syscalls.Supported("dup3", Dup3), 293: syscalls.Supported("pipe2", Pipe2), - 294: syscalls.Supported("inotify_init1", InotifyInit1), + 294: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), 295: syscalls.Supported("preadv", Preadv), 296: syscalls.Supported("pwritev", Pwritev), 297: syscalls.Supported("rt_tgsigqueueinfo", RtTgsigqueueinfo), @@ -376,7 +376,7 @@ var AMD64 = &kernel.SyscallTable{ 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 322: syscalls.Supported("execveat", Execveat), 323: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 324: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls implemented after 325 are "backports" from versions @@ -454,9 +454,9 @@ var ARM64 = &kernel.SyscallTable{ 23: syscalls.Supported("dup", Dup), 24: syscalls.Supported("dup3", Dup3), 25: syscalls.PartiallySupported("fcntl", Fcntl, "Not all options are supported.", nil), - 26: syscalls.Supported("inotify_init1", InotifyInit1), - 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil), - 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil), + 26: syscalls.PartiallySupported("inotify_init1", InotifyInit1, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 27: syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), + 28: syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "Inotify events are only available inside the sandbox. Hard links are treated as different watch targets in gofer fs.", nil), 29: syscalls.PartiallySupported("ioctl", Ioctl, "Only a few ioctls are implemented for backing devices and file systems.", nil), 30: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) 31: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending) @@ -527,8 +527,8 @@ var ARM64 = &kernel.SyscallTable{ 96: syscalls.Supported("set_tid_address", SetTidAddress), 97: syscalls.PartiallySupported("unshare", Unshare, "Mount, cgroup namespaces not supported. Network namespaces supported but must be empty.", nil), 98: syscalls.PartiallySupported("futex", Futex, "Robust futexes not supported.", nil), - 99: syscalls.Error("set_robust_list", syserror.ENOSYS, "Obsolete.", nil), - 100: syscalls.Error("get_robust_list", syserror.ENOSYS, "Obsolete.", nil), + 99: syscalls.Supported("set_robust_list", SetRobustList), + 100: syscalls.Supported("get_robust_list", GetRobustList), 101: syscalls.Supported("nanosleep", Nanosleep), 102: syscalls.Supported("getitimer", Getitimer), 103: syscalls.Supported("setitimer", Setitimer), @@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{ 188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921) 190: syscalls.Supported("semget", Semget), - 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), + 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil), 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), 193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil), 194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil), @@ -695,7 +695,7 @@ var ARM64 = &kernel.SyscallTable{ 280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil), 281: syscalls.Supported("execveat", Execveat), 282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345) - 283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267) + 283: syscalls.PartiallySupported("membarrier", Membarrier, "Not supported on all platforms.", nil), 284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil), // Syscalls after 284 are "backports" from versions of Linux after 4.4. diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index d781d6a04..0bf313a13 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -15,8 +15,9 @@ package linux import ( - "encoding/binary" - + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,59 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// I/O commands. -const ( - _IOCB_CMD_PREAD = 0 - _IOCB_CMD_PWRITE = 1 - _IOCB_CMD_FSYNC = 2 - _IOCB_CMD_FDSYNC = 3 - _IOCB_CMD_NOOP = 6 - _IOCB_CMD_PREADV = 7 - _IOCB_CMD_PWRITEV = 8 -) - -// I/O flags. -const ( - _IOCB_FLAG_RESFD = 1 -) - -// ioCallback describes an I/O request. -// -// The priority field is currently ignored in the implementation below. Also -// note that the IOCB_FLAG_RESFD feature is not supported. -type ioCallback struct { - Data uint64 - Key uint32 - Reserved1 uint32 - - OpCode uint16 - ReqPrio int16 - FD int32 - - Buf uint64 - Bytes uint64 - Offset int64 - - Reserved2 uint64 - Flags uint32 - - // eventfd to signal if IOCB_FLAG_RESFD is set in flags. - ResFD int32 -} - -// ioEvent describes an I/O result. -// -// +stateify savable -type ioEvent struct { - Data uint64 - Obj uint64 - Result int64 - Result2 int64 -} - -// ioEventSize is the size of an ioEvent encoded. -var ioEventSize = binary.Size(ioEvent{}) - // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() @@ -89,7 +37,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // // The context pointer _must_ be zero initially. var idIn uint64 - if _, err := t.CopyIn(idAddr, &idIn); err != nil { + if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil { return 0, nil, err } if idIn != 0 { @@ -102,7 +50,7 @@ func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } // Copy out the new ID. - if _, err := t.CopyOut(idAddr, &id); err != nil { + if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil { t.MemoryManager().DestroyAIOContext(t, id) return 0, nil, err } @@ -192,10 +140,10 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } } - ev := v.(*ioEvent) + ev := v.(*linux.IOEvent) // Copy out the result. - if _, err := t.CopyOut(eventsAddr, ev); err != nil { + if _, err := ev.CopyOut(t, eventsAddr); err != nil { if count > 0 { return uintptr(count), nil, nil } @@ -204,7 +152,7 @@ func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } // Keep rolling. - eventsAddr += usermem.Addr(ioEventSize) + eventsAddr += usermem.Addr(linux.IOEventSize) } // Everything finished. @@ -231,7 +179,7 @@ func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadl } // memoryFor returns appropriate memory for the given callback. -func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. @@ -242,17 +190,17 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { // we have no guarantee that t's AddressSpace will be active during the // I/O. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PWRITE: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_PREADV, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) - case _IOCB_CMD_FSYNC, _IOCB_CMD_FDSYNC, _IOCB_CMD_NOOP: + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: @@ -261,70 +209,78 @@ func memoryFor(t *kernel.Task, cb *ioCallback) (usermem.IOSequence, error) { } } -func performCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *ioCallback, ioseq usermem.IOSequence, ctx *mm.AIOContext, eventFile *fs.File) { - if ctx.Dead() { - ctx.CancelPendingRequest() - return - } - ev := &ioEvent{ - Data: cb.Data, - Obj: uint64(cbAddr), - } +// IoCancel implements linux syscall io_cancel(2). +// +// It is not presently supported (ENOSYS indicates no support on this +// architecture). +func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, syserror.ENOSYS +} - // Construct a context.Context that will not be interrupted if t is - // interrupted. - c := t.AsyncContext() +// LINT.IfChange - var err error - switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV: - ev.Result, err = file.Preadv(c, ioseq, cb.Offset) - case _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: - ev.Result, err = file.Pwritev(c, ioseq, cb.Offset) - case _IOCB_CMD_FSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncAll) - case _IOCB_CMD_FDSYNC: - err = file.Fsync(c, 0, fs.FileMaxOffset, fs.SyncData) - } +func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { + return func(ctx context.Context) { + if actx.Dead() { + actx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } - // Update the result. - if err != nil { - err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) - ev.Result = -int64(kernel.ExtractErrno(err, 0)) - } + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) + case linux.IOCB_CMD_FSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) + case linux.IOCB_CMD_FDSYNC: + err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) + } - file.DecRef() + // Update the result. + if err != nil { + err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } - // Queue the result for delivery. - ctx.FinishRequest(ev) + file.DecRef(ctx) - // Notify the event file if one was specified. This needs to happen - // *after* queueing the result to avoid racing with the thread we may - // wake up. - if eventFile != nil { - eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + // Queue the result for delivery. + actx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFile != nil { + eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) + eventFile.DecRef(ctx) + } } } // submitCallback processes a single callback. -func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Addr) error { +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { file := t.GetFile(cb.FD) if file == nil { // File not found. return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Was there an eventFD? Extract it. var eventFile *fs.File - if cb.Flags&_IOCB_FLAG_RESFD != 0 { + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. return syserror.EBADF } - defer eventFile.DecRef() + defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { @@ -340,7 +296,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Check offset for reads/writes. switch cb.OpCode { - case _IOCB_CMD_PREAD, _IOCB_CMD_PREADV, _IOCB_CMD_PWRITE, _IOCB_CMD_PWRITEV: + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return syserror.EINVAL } @@ -366,7 +322,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *ioCallback, cbAddr usermem.Ad // Perform the request asynchronously. file.IncRef() - fs.Async(func() { performCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile) }) + t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) // All set. return nil @@ -383,21 +339,27 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } for i := int32(0); i < nrEvents; i++ { - // Copy in the address. - cbAddrNative := t.Arch().Native(0) - if _, err := t.CopyIn(addr, cbAddrNative); err != nil { - if i > 0 { - // Some successful. - return uintptr(i), nil, nil + // Copy in the callback address. + var cbAddr usermem.Addr + switch t.Arch().Width() { + case 8: + var cbAddrP primitive.Uint64 + if _, err := cbAddrP.CopyIn(t, addr); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err } - // Nothing done. - return 0, nil, err + cbAddr = usermem.Addr(cbAddrP) + default: + return 0, nil, syserror.ENOSYS } // Copy in this callback. - var cb ioCallback - cbAddr := usermem.Addr(t.Arch().Value(cbAddrNative)) - if _, err := t.CopyIn(cbAddr, &cb); err != nil { + var cb linux.IOCallback + if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. @@ -424,10 +386,4 @@ func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return uintptr(nrEvents), nil, nil } -// IoCancel implements linux syscall io_cancel(2). -// -// It is not presently supported (ENOSYS indicates no support on this -// architecture). -func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ENOSYS -} +// LINT.ThenChange(vfs2/aio.go) diff --git a/pkg/sentry/syscalls/linux/sys_capability.go b/pkg/sentry/syscalls/linux/sys_capability.go index adf5ea5f2..d3b85e11b 100644 --- a/pkg/sentry/syscalls/linux/sys_capability.go +++ b/pkg/sentry/syscalls/linux/sys_capability.go @@ -45,7 +45,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal dataAddr := args[1].Pointer() var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } // hdr.Pid doesn't need to be valid if this capget() is a "version probe" @@ -65,7 +65,7 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal Permitted: uint32(p), Inheritable: uint32(i), } - _, err = t.CopyOut(dataAddr, &data) + _, err = data.CopyOut(t, dataAddr) return 0, nil, err case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3: @@ -88,12 +88,12 @@ func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal Inheritable: uint32(i >> 32), }, } - _, err = t.CopyOut(dataAddr, &data) + _, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:]) return 0, nil, err default: hdr.Version = linux.HighestCapabilityVersion - if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } if dataAddr != 0 { @@ -109,7 +109,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal dataAddr := args[1].Pointer() var hdr linux.CapUserHeader - if _, err := t.CopyIn(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyIn(t, hdrAddr); err != nil { return 0, nil, err } switch hdr.Version { @@ -118,7 +118,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.EPERM } var data linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := data.CopyIn(t, dataAddr); err != nil { return 0, nil, err } p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities @@ -131,7 +131,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, syserror.EPERM } var data [2]linux.CapUserData - if _, err := t.CopyIn(dataAddr, &data); err != nil { + if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil { return 0, nil, err } p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities @@ -141,7 +141,7 @@ func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal default: hdr.Version = linux.HighestCapabilityVersion - if _, err := t.CopyOut(hdrAddr, &hdr); err != nil { + if _, err := hdr.CopyOut(t, hdrAddr); err != nil { return 0, nil, err } return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index ed3413ca6..3b4f879e4 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc event.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) - defer event.DecRef() + defer event.DecRef(t) fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 35a98212a..519066a47 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" @@ -40,7 +41,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, // Common case: we are accessing a file in the root. root := t.FSContext().RootDirectory() err := fn(root, root, name, linux.MaxSymlinkTraversals) - root.DecRef() + root.DecRef(t) return err } else if dir == "." && dirFD == linux.AT_FDCWD { // Common case: we are accessing a file relative to the current @@ -48,8 +49,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, wd := t.FSContext().WorkingDirectory() root := t.FSContext().RootDirectory() err := fn(root, wd, name, linux.MaxSymlinkTraversals) - wd.DecRef() - root.DecRef() + wd.DecRef(t) + root.DecRef(t) return err } @@ -83,6 +84,7 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } rel = f.Dirent if !fs.IsDir(rel.Inode.StableAttr) { + f.DecRef(t) return syserror.ENOTDIR } } @@ -97,19 +99,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } else { d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } - root.DecRef() + root.DecRef(t) if wd != nil { - wd.DecRef() + wd.DecRef(t) } if f != nil { - f.DecRef() + f.DecRef(t) } if err != nil { return err } err = fn(root, d, remainingTraversals) - d.DecRef() + d.DecRef(t) return err } @@ -184,9 +186,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint file, err := d.Inode.GetFile(t, d, fileFlags) if err != nil { - return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return syserror.ConvertIntr(err, syserror.ERESTARTSYS) } - defer file.DecRef() + defer file.DecRef(t) // Success. newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ @@ -242,7 +244,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - file.DecRef() + file.DecRef(t) return nil case linux.ModeNamedPipe: @@ -332,7 +334,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer found.DecRef() + defer found.DecRef(t) // We found something (possibly a symlink). If the // O_EXCL flag was passed, then we can immediately @@ -357,7 +359,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l resolved, err = found.Inode.Getlink(t) if err == nil { // No more resolution necessary. - defer resolved.DecRef() + defer resolved.DecRef(t) break } if err != fs.ErrResolveViaReadlink { @@ -384,7 +386,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer newParent.DecRef() + defer newParent.DecRef(t) // Repeat the process with the parent and name of the // symlink target. @@ -414,9 +416,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // Create a new fs.File. newFile, err = found.Inode.GetFile(t, found, fileFlags) if err != nil { - return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return syserror.ConvertIntr(err, syserror.ERESTARTSYS) } - defer newFile.DecRef() + defer newFile.DecRef(t) case syserror.ENOENT: // File does not exist. Proceed with creation. @@ -432,7 +434,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No luck, bail. return err } - defer newFile.DecRef() + defer newFile.DecRef(t) found = newFile.Dirent default: return err @@ -596,24 +598,24 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Shared flags between file and socket. switch request { case linux.FIONCLEX: - t.FDTable().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: - t.FDTable().SetFlags(fd, kernel.FDFlags{ + t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() @@ -627,7 +629,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FIOASYNC: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.Flags() @@ -641,15 +643,14 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.FIOSETOWN, linux.SIOCSPGRP: var set int32 - if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } fSetOwn(t, file, set) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: - who := fGetOwn(t, file) - _, err := t.CopyOut(args[2].Pointer(), &who) + _, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file)) return 0, nil, err default: @@ -671,9 +672,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() size := args[1].SizeT() cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Get our fullname from the root and preprend unreachable if the root was // unreachable from our current dirent this is the same behavior as on linux. @@ -694,7 +695,7 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // Top it off with a terminator. - _, err = t.CopyOut(addr+usermem.Addr(bytes), []byte("\x00")) + _, err = t.CopyOutBytes(addr+usermem.Addr(bytes), []byte("\x00")) return uintptr(bytes + 1), nil, err } @@ -722,7 +723,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return err } - t.FSContext().SetRootDirectory(d) + t.FSContext().SetRootDirectory(t, d) return nil }) } @@ -747,7 +748,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return err } - t.FSContext().SetWorkingDirectory(d) + t.FSContext().SetWorkingDirectory(t, d) return nil }) } @@ -760,7 +761,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { @@ -772,7 +773,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - t.FSContext().SetWorkingDirectory(file.Dirent) + t.FSContext().SetWorkingDirectory(t, file.Dirent) return 0, nil, nil } @@ -787,11 +788,11 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). - file, _ := t.FDTable().Remove(fd) + file, _ := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Flush(t) return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -805,7 +806,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { @@ -826,7 +827,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) return uintptr(newfd), nil, nil } @@ -850,7 +851,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { @@ -900,14 +901,20 @@ func fGetOwn(t *kernel.Task, file *fs.File) int32 { // // If who is positive, it represents a PID. If negative, it represents a PGID. // If the PID or PGID is invalid, the owner is silently unset. -func fSetOwn(t *kernel.Task, file *fs.File, who int32) { +func fSetOwn(t *kernel.Task, file *fs.File, who int32) error { a := file.Async(fasync.New).(*fasync.FileAsync) if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return syserror.EINVAL + } pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who)) a.SetOwnerProcessGroup(t, pg) + } else { + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) + a.SetOwnerThreadGroup(t, tg) } - tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who)) - a.SetOwnerThreadGroup(t, tg) + return nil } // Fcntl implements linux syscall fcntl(2). @@ -919,7 +926,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -935,10 +942,10 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDTable().SetFlags(fd, kernel.FDFlags{ + err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) - return 0, nil, nil + return 0, nil, err case linux.F_GETFL: return uintptr(file.Flags().ToLinux()), nil, nil case linux.F_SETFL: @@ -956,7 +963,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy in the lock request. flockAddr := args[2].Pointer() var flock linux.Flock - if _, err := t.CopyIn(flockAddr, &flock); err != nil { + if _, err := flock.CopyIn(t, flockAddr); err != nil { return 0, nil, err } @@ -998,9 +1005,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDTable. - lockUniqueID := lock.UniqueID(t.FDTable().ID()) - // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { @@ -1010,12 +1014,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -1026,18 +1030,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } return 0, nil, nil case linux.F_UNLCK: - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) return 0, nil, nil default: return 0, nil, syserror.EINVAL @@ -1045,17 +1049,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_GETOWN: return uintptr(fGetOwn(t, file)), nil, nil case linux.F_SETOWN: - fSetOwn(t, file, args[2].Int()) - return 0, nil, nil + return 0, nil, fSetOwn(t, file, args[2].Int()) case linux.F_GETOWN_EX: addr := args[2].Pointer() owner := fGetOwnEx(t, file) - _, err := t.CopyOut(addr, &owner) + _, err := owner.CopyOut(t, addr) return 0, nil, err case linux.F_SETOWN_EX: addr := args[2].Pointer() var owner linux.FOwnerEx - n, err := t.CopyIn(addr, &owner) + _, err := owner.CopyIn(t, addr) if err != nil { return 0, nil, err } @@ -1067,21 +1070,21 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ESRCH } a.SetOwnerTask(t, task) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID)) if tg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerThreadGroup(t, tg) - return uintptr(n), nil, nil + return 0, nil, nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID)) if pg == nil { return 0, nil, syserror.ESRCH } a.SetOwnerProcessGroup(t, pg) - return uintptr(n), nil, nil + return 0, nil, nil default: return 0, nil, syserror.EINVAL } @@ -1114,17 +1117,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } -// LINT.ThenChange(vfs2/fd.go) - -const ( - _FADV_NORMAL = 0 - _FADV_RANDOM = 1 - _FADV_SEQUENTIAL = 2 - _FADV_WILLNEED = 3 - _FADV_DONTNEED = 4 - _FADV_NOREUSE = 5 -) - // Fadvise64 implements linux syscall fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { @@ -1141,7 +1133,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { @@ -1149,12 +1141,12 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } switch advice { - case _FADV_NORMAL: - case _FADV_RANDOM: - case _FADV_SEQUENTIAL: - case _FADV_WILLNEED: - case _FADV_DONTNEED: - case _FADV_NOREUSE: + case linux.POSIX_FADV_NORMAL: + case linux.POSIX_FADV_RANDOM: + case linux.POSIX_FADV_SEQUENTIAL: + case linux.POSIX_FADV_WILLNEED: + case linux.POSIX_FADV_DONTNEED: + case linux.POSIX_FADV_NOREUSE: default: return 0, nil, syserror.EINVAL } @@ -1163,6 +1155,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.ThenChange(vfs2/fd.go) + // LINT.IfChange func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { @@ -1182,7 +1176,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode switch err { case nil: // The directory existed. - defer f.DecRef() + defer f.DecRef(t) return syserror.EEXIST case syserror.EACCES: // Permission denied while walking to the directory. @@ -1360,7 +1354,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32 if target == nil { return syserror.EBADF } - defer target.DecRef() + defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { return err } @@ -1613,7 +1607,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. @@ -1741,7 +1735,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return chown(t, file.Dirent, uid, gid) } @@ -1779,7 +1773,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chown(t, file.Dirent, uid, gid) } @@ -1844,7 +1838,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chmod(t, file.Dirent, mode) } @@ -1904,10 +1898,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso if f == nil { return syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals) } @@ -1929,7 +1923,7 @@ func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times linux.Utime - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := times.CopyIn(t, timesAddr); err != nil { return 0, nil, err } ts = fs.TimeSpec{ @@ -1949,7 +1943,7 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } ts = fs.TimeSpec{ @@ -1977,7 +1971,7 @@ func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timespec - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) { @@ -2011,7 +2005,7 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ts := defaultSetToSystemTimeSpec() if timesAddr != 0 { var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return 0, nil, err } if times[0].Usec >= 1e6 || times[0].Usec < 0 || @@ -2099,7 +2093,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL @@ -2152,27 +2146,11 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB - // flock(2): - // Locks created by flock() are associated with an open file table entry. This means that - // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the - // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, - // the lock is released either by an explicit LOCK_UN operation on any of these duplicate - // descriptors, or when all such descriptors have been closed. - // - // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, - // these descriptors are treated independently by flock(). An attempt to lock the file using - // one of these file descriptors may be denied by a lock that the calling process has already placed via - // another descriptor. - // - // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) - // and fork(2). - lockUniqueID := lock.UniqueID(file.UniqueID) - // A BSD style lock spans the entire file. rng := lock.LockRange{ Start: 0, @@ -2183,29 +2161,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_UN: - file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) default: // flock(2): EINVAL operation is invalid. return 0, nil, syserror.EINVAL @@ -2251,8 +2229,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - defer dirent.DecRef() - defer file.DecRef() + defer dirent.DecRef(t) + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index b68261f72..f39ce0639 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -73,8 +73,8 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) } - t.Futex().WaitComplete(w) - return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + t.Futex().WaitComplete(w, t) + return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is @@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add } remaining, err := t.BlockWithTimeout(w.C, !forever, duration) - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) if err == nil { return 0, nil } @@ -110,7 +110,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add // The wait duration was absolute, restart with the original arguments. if forever { - return 0, kernel.ERESTARTSYS + return 0, syserror.ERESTARTSYS } // The wait duration was relative, restart with the remaining duration. @@ -121,7 +121,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add val: val, mask: mask, }) - return 0, kernel.ERESTART_RESTARTBLOCK + return 0, syserror.ERESTART_RESTARTBLOCK } func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.Addr, private bool) error { @@ -148,8 +148,8 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A timer.Destroy() } - t.Futex().WaitComplete(w) - return syserror.ConvertIntr(err, kernel.ERESTARTSYS) + t.Futex().WaitComplete(w, t) + return syserror.ConvertIntr(err, syserror.ERESTARTSYS) } func tryLockPI(t *kernel.Task, addr usermem.Addr, private bool) error { @@ -198,7 +198,7 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall switch cmd { case linux.FUTEX_WAIT: // WAIT uses a relative timeout. - mask = ^uint32(0) + mask = linux.FUTEX_BITSET_MATCH_ANY var timeoutDur time.Duration if !forever { timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond @@ -286,3 +286,53 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.ENOSYS } } + +// SetRobustList implements linux syscall set_robust_list(2). +func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + head := args[0].Pointer() + length := args[1].SizeT() + + if length != uint(linux.SizeOfRobustListHead) { + return 0, nil, syserror.EINVAL + } + t.SetRobustList(head) + return 0, nil, nil +} + +// GetRobustList implements linux syscall get_robust_list(2). +func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // Despite the syscall using the name 'pid' for this variable, it is + // very much a tid. + tid := args[0].Int() + headAddr := args[1].Pointer() + sizeAddr := args[2].Pointer() + + if tid < 0 { + return 0, nil, syserror.EINVAL + } + + ot := t + if tid != 0 { + if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { + return 0, nil, syserror.ESRCH + } + } + + // Copy out head pointer. + head := t.Arch().Native(uintptr(ot.GetRobustList())) + if _, err := head.CopyOut(t, headAddr); err != nil { + return 0, nil, err + } + + // Copy out size, which is a constant. Note that while size isn't + // an address, it is defined as the arch-dependent size_t, so it + // needs to be converted to a native-sized int. + size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) + if _, err := size.CopyOut(t, sizeAddr); err != nil { + return 0, nil, err + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index b126fecc0..b25f7d881 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -19,7 +19,6 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -68,7 +67,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir if dir == nil { return 0, syserror.EBADF } - defer dir.DecRef() + defer dir.DecRef(t) w := &usermem.IOReadWriter{ Ctx: t, @@ -82,7 +81,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir ds := newDirentSerializer(f, w, t.Arch(), size) rerr := dir.Readdir(t, ds) - switch err := handleIOError(t, ds.Written() > 0, rerr, kernel.ERESTARTSYS, "getdents", dir); err { + switch err := handleIOError(t, ds.Written() > 0, rerr, syserror.ERESTARTSYS, "getdents", dir); err { case nil: dir.Dirent.InotifyEvent(linux.IN_ACCESS, 0) return uintptr(ds.Written()), nil @@ -93,19 +92,23 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir } } -// oldDirentHdr is a fixed sized header matching the fixed size -// fields found in the old linux dirent struct. +// oldDirentHdr is a fixed sized header matching the fixed size fields found in +// the old linux dirent struct. +// +// +marshal type oldDirentHdr struct { Ino uint64 Off uint64 - Reclen uint16 + Reclen uint16 `marshal:"unaligned"` // Struct ends mid-word. } -// direntHdr is a fixed sized header matching the fixed size -// fields found in the new linux dirent struct. +// direntHdr is a fixed sized header matching the fixed size fields found in the +// new linux dirent struct. +// +// +marshal type direntHdr struct { OldHdr oldDirentHdr - Typ uint8 + Typ uint8 `marshal:"unaligned"` // Struct ends mid-word. } // dirent contains the data pointed to by a new linux dirent struct. @@ -134,20 +137,20 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent // the old linux dirent format. func smallestDirent(a arch.Context) uint { d := dirent{} - return uint(binary.Size(d.Hdr.OldHdr)) + a.Width() + 1 + return uint(d.Hdr.OldHdr.SizeBytes()) + a.Width() + 1 } // smallestDirent64 returns the size of the smallest possible dirent using // the new linux dirent format. func smallestDirent64(a arch.Context) uint { d := dirent{} - return uint(binary.Size(d.Hdr)) + a.Width() + return uint(d.Hdr.SizeBytes()) + a.Width() } // padRec pads the name field until the rec length is a multiple of the width, // which must be a power of 2. It returns the padded rec length. func (d *dirent) padRec(width int) uint16 { - a := int(binary.Size(d.Hdr)) + len(d.Name) + a := d.Hdr.SizeBytes() + len(d.Name) r := (a + width) &^ (width - 1) padding := r - a d.Name = append(d.Name, make([]byte, padding)...) @@ -157,7 +160,7 @@ func (d *dirent) padRec(width int) uint16 { // Serialize64 serializes a Dirent struct to a byte slice, keeping the new // linux dirent format. Returns the number of bytes serialized or an error. func (d *dirent) Serialize64(w io.Writer) (int, error) { - n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr)) + n1, err := d.Hdr.WriteTo(w) if err != nil { return 0, err } @@ -165,14 +168,14 @@ func (d *dirent) Serialize64(w io.Writer) (int, error) { if err != nil { return 0, err } - return n1 + n2, nil + return int(n1) + n2, nil } // Serialize serializes a Dirent struct to a byte slice, using the old linux // dirent format. // Returns the number of bytes serialized or an error. func (d *dirent) Serialize(w io.Writer) (int, error) { - n1, err := w.Write(binary.Marshal(nil, usermem.ByteOrder, d.Hdr.OldHdr)) + n1, err := d.Hdr.OldHdr.WriteTo(w) if err != nil { return 0, err } @@ -184,7 +187,7 @@ func (d *dirent) Serialize(w io.Writer) (int, error) { if err != nil { return 0, err } - return n1 + n2 + n3, nil + return int(n1) + n2 + n3, nil } // direntSerializer implements fs.InodeOperationsInfoSerializer, serializing dirents to an diff --git a/pkg/sentry/syscalls/linux/sys_identity.go b/pkg/sentry/syscalls/linux/sys_identity.go index 715ac45e6..a29d307e5 100644 --- a/pkg/sentry/syscalls/linux/sys_identity.go +++ b/pkg/sentry/syscalls/linux/sys_identity.go @@ -49,13 +49,13 @@ func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() - if _, err := t.CopyOut(ruidAddr, ruid); err != nil { + if _, err := ruid.CopyOut(t, ruidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(euidAddr, euid); err != nil { + if _, err := euid.CopyOut(t, euidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(suidAddr, suid); err != nil { + if _, err := suid.CopyOut(t, suidAddr); err != nil { return 0, nil, err } return 0, nil, nil @@ -84,13 +84,13 @@ func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() - if _, err := t.CopyOut(rgidAddr, rgid); err != nil { + if _, err := rgid.CopyOut(t, rgidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(egidAddr, egid); err != nil { + if _, err := egid.CopyOut(t, egidAddr); err != nil { return 0, nil, err } - if _, err := t.CopyOut(sgidAddr, sgid); err != nil { + if _, err := sgid.CopyOut(t, sgidAddr); err != nil { return 0, nil, err } return 0, nil, nil @@ -157,7 +157,7 @@ func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys for i, kgid := range kgids { gids[i] = kgid.In(t.UserNamespace()).OrOverflow() } - if _, err := t.CopyOut(args[1].Pointer(), gids); err != nil { + if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return uintptr(len(gids)), nil, nil @@ -173,7 +173,7 @@ func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, t.SetExtraGIDs(nil) } gids := make([]auth.GID, size) - if _, err := t.CopyIn(args[1].Pointer(), &gids); err != nil { + if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return 0, nil, t.SetExtraGIDs(gids) diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index b2c7b3444..cf47bb9dd 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. NonBlocking: flags&linux.IN_NONBLOCK != 0, } n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) - defer n.DecRef() + defer n.DecRef(t) fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. - file.DecRef() + file.DecRef(t) return nil, nil, syserror.EINVAL } @@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer file.DecRef() - return 0, nil, ino.RmWatch(wd) + defer file.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 3f7691eae..0046347cb 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var sw fs.SeekWhence switch whence { @@ -48,7 +48,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } offset, serr := file.Seek(t, sw, offset) - err := handleIOError(t, false /* partialResult */, serr, kernel.ERESTARTSYS, "lseek", file) + err := handleIOError(t, false /* partialResult */, serr, syserror.ERESTARTSYS, "lseek", file) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_membarrier.go b/pkg/sentry/syscalls/linux/sys_membarrier.go new file mode 100644 index 000000000..63ee5d435 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_membarrier.go @@ -0,0 +1,103 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Membarrier implements syscall membarrier(2). +func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + cmd := args[0].Int() + flags := args[1].Uint() + + switch cmd { + case linux.MEMBARRIER_CMD_QUERY: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + var supportedCommands uintptr + if t.Kernel().Platform.HaveGlobalMemoryBarrier() { + supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL | + linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED | + linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | + linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | + linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED + } + if t.RSeqAvailable() { + supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | + linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ + } + return supportedCommands, nil, nil + case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { + return 0, nil, syserror.EPERM + } + return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() + case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + // no-op + return 0, nil, nil + case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { + return 0, nil, syserror.EINVAL + } + t.MemoryManager().EnableMembarrierPrivate() + return 0, nil, nil + case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { + return 0, nil, syserror.EINVAL + } + if !t.RSeqAvailable() { + return 0, nil, syserror.EINVAL + } + if !t.MemoryManager().IsMembarrierRSeqEnabled() { + return 0, nil, syserror.EPERM + } + // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have + // the ability to preempt specific CPUs. + return 0, nil, t.Kernel().Platform.PreemptAllCPUs() + case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + if flags != 0 { + return 0, nil, syserror.EINVAL + } + if !t.RSeqAvailable() { + return 0, nil, syserror.EINVAL + } + t.MemoryManager().EnableMembarrierRSeq() + return 0, nil, nil + default: + // Probably a command we don't implement. + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 91694d374..cd8dfdfa4 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. @@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with a special mappable. + opts.Offset = 0 + m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel()) + if err != nil { + return 0, nil, err + } + opts.MappingIdentity = m // transfers ownership of m to opts + opts.Mappable = m } rv, err := t.MemoryManager().MMap(t, opts) @@ -239,7 +248,7 @@ func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca return 0, nil, syserror.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/usermem.PageSize)) - _, err := t.CopyOut(vec, resident) + _, err := t.CopyOutBytes(vec, resident) return 0, nil, err } @@ -267,7 +276,7 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }) // MSync calls fsync, the same interrupt conversion rules apply, see // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // Mlock implements linux syscall mlock(2). diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index eb5ff48f5..bd0633564 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }); err != nil { // Something went wrong. Drop our ref on rootInode before // returning the error. - rootInode.DecRef() + rootInode.DecRef(t) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 43c510930..849a47476 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -34,10 +35,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) r.SetFlags(linuxToFlags(flags).Settable()) - defer r.DecRef() + defer r.DecRef(t) w.SetFlags(linuxToFlags(flags).Settable()) - defer w.DecRef() + defer w.DecRef(t) fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -46,10 +47,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { return 0, err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + if file, _ := t.FDTable().Remove(t, fd); file != nil { + file.DecRef(t) } } return 0, err diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index f0198141c..254f4c9f9 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -70,7 +70,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -162,7 +162,7 @@ func CopyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD pfd := make([]linux.PollFD, nfds) if nfds > 0 { - if _, err := t.CopyIn(addr, &pfd); err != nil { + if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } @@ -189,7 +189,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { - if _, err := t.CopyOut(addr, pfd); err != nil { + if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } @@ -202,7 +202,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy set := make([]byte, nBytes) if addr != 0 { - if _, err := t.CopyIn(addr, &set); err != nil { + if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. @@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { @@ -329,19 +329,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Copy updated vectors back. if readFDs != 0 { - if _, err := t.CopyOut(readFDs, r); err != nil { + if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { - if _, err := t.CopyOut(writeFDs, w); err != nil { + if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { - if _, err := t.CopyOut(exceptFDs, e); err != nil { + if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } @@ -410,7 +410,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration nfds: nfds, timeout: remainingTimeout, }) - return 0, kernel.ERESTART_RESTARTBLOCK + return 0, syserror.ERESTART_RESTARTBLOCK } return n, err } @@ -464,7 +464,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } @@ -494,7 +494,7 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } @@ -539,7 +539,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index f92bf8096..a892d2c62 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -43,7 +44,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, nil case linux.PR_GET_PDEATHSIG: - _, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal())) + _, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal())) return 0, nil, err case linux.PR_GET_DUMPABLE: @@ -110,7 +111,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall buf[len] = 0 len++ } - _, err := t.CopyOut(addr, buf[:len]) + _, err := t.CopyOutBytes(addr, buf[:len]) if err != nil { return 0, nil, err } @@ -128,7 +129,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { @@ -136,7 +137,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) + t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 071b4bacc..f655d3db1 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -71,7 +71,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "read", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "read", file) } // Readahead implements readahead(2). @@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) } // Readv implements linux syscall readv(2). @@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -181,7 +181,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) } // Preadv implements linux syscall preadv(2). @@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -222,7 +222,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) } // Preadv2 implements linux syscall preadv2(2). @@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -280,12 +280,12 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if offset == -1 { n, err := readv(t, file, dst) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) } n, err := preadv(t, file, dst, offset) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) } func readv(t *kernel.Task, f *fs.File, dst usermem.IOSequence) (int64, error) { diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index d5d5b6959..309c183a3 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -26,17 +27,13 @@ import ( // rlimit describes an implementation of 'struct rlimit', which may vary from // system-to-system. type rlimit interface { + marshal.Marshallable + // toLimit converts an rlimit to a limits.Limit. toLimit() *limits.Limit // fromLimit converts a limits.Limit to an rlimit. fromLimit(lim limits.Limit) - - // copyIn copies an rlimit from the untrusted app to the kernel. - copyIn(t *kernel.Task, addr usermem.Addr) error - - // copyOut copies an rlimit from the kernel to the untrusted app. - copyOut(t *kernel.Task, addr usermem.Addr) error } // newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. @@ -50,6 +47,7 @@ func newRlimit(t *kernel.Task) (rlimit, error) { } } +// +marshal type rlimit64 struct { Cur uint64 Max uint64 @@ -70,12 +68,12 @@ func (r *rlimit64) fromLimit(lim limits.Limit) { } func (r *rlimit64) copyIn(t *kernel.Task, addr usermem.Addr) error { - _, err := t.CopyIn(addr, r) + _, err := r.CopyIn(t, addr) return err } func (r *rlimit64) copyOut(t *kernel.Task, addr usermem.Addr) error { - _, err := t.CopyOut(addr, *r) + _, err := r.CopyOut(t, addr) return err } @@ -140,7 +138,8 @@ func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } rlim.fromLimit(lim) - return 0, nil, rlim.copyOut(t, addr) + _, err = rlim.CopyOut(t, addr) + return 0, nil, err } // Setrlimit implements linux syscall setrlimit(2). @@ -155,7 +154,7 @@ func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if err != nil { return 0, nil, err } - if err := rlim.copyIn(t, addr); err != nil { + if _, err := rlim.CopyIn(t, addr); err != nil { return 0, nil, syserror.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) diff --git a/pkg/sentry/syscalls/linux/sys_rusage.go b/pkg/sentry/syscalls/linux/sys_rusage.go index 1674c7445..ac5c98a54 100644 --- a/pkg/sentry/syscalls/linux/sys_rusage.go +++ b/pkg/sentry/syscalls/linux/sys_rusage.go @@ -80,7 +80,7 @@ func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } ru := getrusage(t, which) - _, err := t.CopyOut(addr, &ru) + _, err := ru.CopyOut(t, addr) return 0, nil, err } @@ -104,7 +104,7 @@ func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall CUTime: linux.ClockTFromDuration(cs2.UserTime), CSTime: linux.ClockTFromDuration(cs2.SysTime), } - if _, err := t.CopyOut(addr, &r); err != nil { + if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go index 99f6993f5..bfcf44b6f 100644 --- a/pkg/sentry/syscalls/linux/sys_sched.go +++ b/pkg/sentry/syscalls/linux/sys_sched.go @@ -27,8 +27,10 @@ const ( ) // SchedParam replicates struct sched_param in sched.h. +// +// +marshal type SchedParam struct { - schedPriority int64 + schedPriority int32 } // SchedGetparam implements linux syscall sched_getparam(2). @@ -45,7 +47,7 @@ func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.ESRCH } r := SchedParam{schedPriority: onlyPriority} - if _, err := t.CopyOut(param, r); err != nil { + if _, err := r.CopyOut(t, param); err != nil { return 0, nil, err } @@ -79,7 +81,7 @@ func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ke return 0, nil, syserror.ESRCH } var r SchedParam - if _, err := t.CopyIn(param, &r); err != nil { + if _, err := r.CopyIn(t, param); err != nil { return 0, nil, syserror.EINVAL } if r.schedPriority != onlyPriority { diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 5b7a66f4d..4fdb4463c 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -24,6 +24,8 @@ import ( ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. +// +// +marshal type userSockFprog struct { // Len is the length of the filter in BPF instructions. Len uint16 @@ -33,7 +35,7 @@ type userSockFprog struct { // Filter is a user pointer to the struct sock_filter array that makes up // the filter program. Filter is a uint64 rather than a usermem.Addr // because usermem.Addr is actually uintptr, which is not a fixed-size - // type, and encoding/binary.Read objects to this. + // type. Filter uint64 } @@ -54,11 +56,11 @@ func seccomp(t *kernel.Task, mode, flags uint64, addr usermem.Addr) error { } var fprog userSockFprog - if _, err := t.CopyIn(addr, &fprog); err != nil { + if _, err := fprog.CopyIn(t, addr); err != nil { return err } filter := make([]linux.BPFInstruction, int(fprog.Len)) - if _, err := t.CopyIn(usermem.Addr(fprog.Filter), &filter); err != nil { + if _, err := linux.CopyBPFInstructionSliceIn(t, usermem.Addr(fprog.Filter), filter); err != nil { return err } compiledFilter, err := bpf.Compile(filter) diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index 5f54f2456..c2d4bf805 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -66,7 +67,7 @@ func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } ops := make([]linux.Sembuf, nsops) - if _, err := t.CopyIn(sembufAddr, ops); err != nil { + if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil { return 0, nil, err } @@ -116,8 +117,8 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal case linux.IPC_SET: arg := args[3].Pointer() - s := linux.SemidDS{} - if _, err := t.CopyIn(arg, &s); err != nil { + var s linux.SemidDS + if _, err := s.CopyIn(t, arg); err != nil { return 0, nil, err } @@ -128,9 +129,17 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal v, err := getPID(t, id, num) return uintptr(v), nil, err + case linux.IPC_STAT: + arg := args[3].Pointer() + ds, err := ipcStat(t, id) + if err == nil { + _, err = ds.CopyOut(t, arg) + } + + return 0, nil, err + case linux.IPC_INFO, linux.SEM_INFO, - linux.IPC_STAT, linux.SEM_STAT, linux.SEM_STAT_ANY, linux.GETNCNT, @@ -170,6 +179,16 @@ func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FileP return set.Change(t, creds, owner, perms) } +func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) { + r := t.IPCNamespace().SemaphoreRegistry() + set := r.FindByID(id) + if set == nil { + return nil, syserror.EINVAL + } + creds := auth.CredentialsFromContext(t) + return set.GetStat(creds) +} + func setVal(t *kernel.Task, id int32, num int32, val int16) error { r := t.IPCNamespace().SemaphoreRegistry() set := r.FindByID(id) @@ -188,7 +207,7 @@ func setValAll(t *kernel.Task, id int32, array usermem.Addr) error { return syserror.EINVAL } vals := make([]uint16, set.Size()) - if _, err := t.CopyIn(array, vals); err != nil { + if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil { return err } creds := auth.CredentialsFromContext(t) @@ -217,7 +236,7 @@ func getValAll(t *kernel.Task, id int32, array usermem.Addr) error { if err != nil { return err } - _, err = t.CopyOut(array, vals) + _, err = primitive.CopyUint16SliceOut(t, array, vals) return err } diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 4a8bc24a2..584064143 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer segment.DecRef() + defer segment.DecRef(t) return uintptr(segment.ID), nil, nil } @@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -108,22 +108,22 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { - _, err = t.CopyOut(buf, stat) + _, err = stat.CopyOut(t, buf) } return 0, nil, err case linux.IPC_INFO: params := r.IPCInfo() - _, err := t.CopyOut(buf, params) + _, err := params.CopyOut(t, buf) return 0, nil, err case linux.SHM_INFO: info := r.ShmInfo() - _, err := t.CopyOut(buf, info) + _, err := info.CopyOut(t, buf) return 0, nil, err } @@ -132,20 +132,19 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) switch cmd { case linux.IPC_SET: var ds linux.ShmidDS - _, err = t.CopyIn(buf, &ds) - if err != nil { + if _, err = ds.CopyIn(t, buf); err != nil { return 0, nil, err } - err = segment.Set(t, &ds) + err := segment.Set(t, &ds) return 0, nil, err case linux.IPC_RMID: - segment.MarkDestroyed() + segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index d2b0012ae..e748d33d8 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -348,7 +348,7 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S // Pause implements linux syscall pause(2). func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) + return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) } // RtSigpending implements linux syscall rt_sigpending(2). @@ -496,7 +496,7 @@ func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. t.SetSavedSignalMask(oldmask) // Perform the wait. - return 0, nil, syserror.ConvertIntr(t.Block(nil), kernel.ERESTARTNOHAND) + return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) } // RestartSyscall implements the linux syscall restart_syscall(2). @@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { @@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Set appropriate flags. file.SetFlags(fs.SettableFileFlags{ diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 0760af77b..9cd052c3d 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -65,10 +67,10 @@ const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. -var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) +var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. -var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) +var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). @@ -76,6 +78,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. +// +// +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 @@ -104,30 +108,14 @@ type MessageHeader64 struct { // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. +// +// +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } -// CopyInMessageHeader64 copies a message header from user to kernel memory. -func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { - b := t.CopyScratchBuffer(52) - if _, err := t.CopyInBytes(addr, b); err != nil { - return err - } - - msg.Name = usermem.ByteOrder.Uint64(b[0:]) - msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) - msg.Iov = usermem.ByteOrder.Uint64(b[16:]) - msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) - msg.Control = usermem.ByteOrder.Uint64(b[32:]) - msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) - msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) - - return nil -} - // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { @@ -146,10 +134,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, // writeAddress writes a sockaddr structure and its length to an output buffer // in the unstrusted address space range. If the address is bigger than the // buffer, it is truncated. -func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { +func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { // Get the buffer length. var bufLen uint32 - if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } @@ -158,7 +146,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Write the length unconditionally. - if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } @@ -171,7 +159,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Copy as much of the address as will fit in the buffer. - encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) + addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } @@ -198,7 +187,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal s.SetFlags(fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, }) - defer s.DecRef() + defer s.DecRef(t) fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, @@ -233,8 +222,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } s1.SetFlags(fileFlags) s2.SetFlags(fileFlags) - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) // Create the FDs for the sockets. fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ @@ -245,10 +234,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Copy the file descriptors out. - if _, err := t.CopyOut(socks, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, socks, fds); err != nil { for _, fd := range fds { - if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + if file, _ := t.FDTable().Remove(t, fd); file != nil { + file.DecRef(t) } } return 0, nil, err @@ -268,7 +257,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -283,7 +272,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := !file.Flags().NonBlocking - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -299,7 +288,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -314,7 +303,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't @@ -358,7 +347,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -385,7 +374,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -414,7 +403,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -445,7 +434,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -454,8 +443,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Read the length. Reject negative values. - optLen := int32(0) - if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + var optLen int32 + if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { @@ -469,12 +458,12 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } vLen := int32(binary.Size(v)) - if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil { return 0, nil, err } if v != nil { - if _, err := t.CopyOut(optValAddr, v); err != nil { + if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } @@ -484,7 +473,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. -func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { +func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: @@ -496,13 +485,16 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr use switch name { case linux.SO_TYPE: _, skType, _ := s.Type() - return int32(skType), nil + v := primitive.Int32(skType) + return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() - return int32(family), nil + v := primitive.Int32(family) + return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() - return int32(protocol), nil + v := primitive.Int32(protocol) + return &v, nil } } @@ -524,7 +516,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -539,7 +531,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) - if _, err := t.CopyIn(optValAddr, &buf); err != nil { + if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } @@ -562,7 +554,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -590,7 +582,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -623,7 +615,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -676,7 +668,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -728,7 +720,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -743,7 +735,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -766,16 +758,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } @@ -788,9 +780,9 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -812,17 +804,17 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } // Copy the control data to the caller. - if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { - if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } @@ -846,7 +838,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -875,9 +867,9 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } // Copy the address to the caller. @@ -919,7 +911,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -957,7 +949,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -991,7 +983,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -1006,7 +998,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr usermem.Addr, flags int32) (uintptr, error) { // Capture the message header. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -1017,7 +1009,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) - if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } @@ -1059,9 +1051,11 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) - if err != nil { - controlMessages.Release() + err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { + controlMessages.Release(t) } return uintptr(n), err } @@ -1079,7 +1073,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -1119,7 +1113,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) + return uintptr(n), handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 77c78889d..46616c961 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -16,6 +16,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -101,7 +102,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.Flags().Read { return 0, nil, syserror.EBADF @@ -111,7 +112,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.Flags().Write { return 0, nil, syserror.EBADF @@ -141,7 +142,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Copy in the offset. var offset int64 - if _, err := t.CopyIn(offsetAddr, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, offsetAddr, &offset); err != nil { return 0, nil, err } @@ -149,11 +150,11 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err = doSplice(t, outFile, inFile, fs.SpliceOpts{ Length: count, SrcOffset: true, - SrcStart: offset, + SrcStart: int64(offset), }, outFile.Flags().NonBlocking) // Copy out the new offset. - if _, err := t.CopyOut(offsetAddr, n+offset); err != nil { + if _, err := primitive.CopyInt64Out(t, offsetAddr, offset+n); err != nil { return 0, nil, err } } else { @@ -170,7 +171,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // We can only pass a single file to handleIOError, so pick inFile // arbitrarily. This is used only for debugging purposes. - return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "sendfile", inFile) + return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "sendfile", inFile) } // Splice implements splice(2). @@ -192,13 +193,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // The operation is non-blocking if anything is non-blocking. // @@ -228,7 +229,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } var offset int64 - if _, err := t.CopyIn(outOffset, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, outOffset, &offset); err != nil { return 0, nil, err } @@ -246,7 +247,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } var offset int64 - if _, err := t.CopyIn(inOffset, &offset); err != nil { + if _, err := primitive.CopyInt64In(t, inOffset, &offset); err != nil { return 0, nil, err } @@ -280,7 +281,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "splice", inFile) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "splice", inFile) } // Tee imlements tee(2). @@ -300,13 +301,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { @@ -333,5 +334,5 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo } // See above; inFile is chosen arbitrarily here. - return uintptr(n), nil, handleIOError(t, false, err, kernel.ERESTARTSYS, "tee", inFile) + return uintptr(n), nil, handleIOError(t, false, err, syserror.ERESTARTSYS, "tee", inFile) } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 46ebf27a2..cda29a8b5 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) uattr, err := file.UnstableAttr(t) if err != nil { return 0, nil, err @@ -221,7 +221,7 @@ func statx(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr, statxAddr DevMajor: uint32(devMajor), DevMinor: devMinor, } - _, err := t.CopyOut(statxAddr, &s) + _, err := s.CopyOut(t, statxAddr) return err } @@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) } @@ -283,7 +283,7 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { FragmentSize: d.Inode.StableAttr.BlockSize, // Leave other fields 0 like simple_statfs does. } - _, err = t.CopyOut(addr, &statfs) + _, err = statfs.CopyOut(t, addr) return err } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ad465ae3..048a21c6e 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Use "sync-the-world" for now, it's guaranteed that fd is at least // on the root filesystem. @@ -54,10 +54,10 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // Fdatasync implements linux syscall fdatasync(2). @@ -70,10 +70,10 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // SyncFileRange implements linux syscall sync_file_rage(2) @@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the // specified range that have already been submitted to the device @@ -135,7 +135,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel err = file.Fsync(t, offset, fs.FileMaxOffset, fs.SyncData) } - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 297de052a..db3d924d9 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -21,13 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" ) -// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +// Sysinfo implements Linux syscall sysinfo(2). func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mf := t.Kernel().MemoryFile() - mf.UpdateUsage() - _, totalUsage := usage.MemoryAccounting.Copy() + mfUsage, err := mf.TotalUsage() + if err != nil { + return 0, nil, err + } + memStats, _ := usage.MemoryAccounting.Copy() + totalUsage := mfUsage + memStats.Mapped totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) memFree := totalSize - totalUsage if memFree > totalSize { @@ -37,12 +41,12 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ - Procs: uint16(len(t.PIDNamespace().Tasks())), + Procs: uint16(t.Kernel().TaskSet().Root.NumTasks()), Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, FreeRAM: memFree, Unit: 1, } - _, err := t.CopyOut(addr, si) + _, err = si.CopyOut(t, addr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 00915fdde..39ca9ea97 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -19,6 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" @@ -117,7 +118,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) var wd *fs.Dirent var executable fsbridge.File @@ -133,7 +134,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { @@ -155,7 +156,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user } } if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) } // Load the new TaskContext. @@ -262,7 +263,7 @@ func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { wopts.Events |= kernel.EventGroupContinue } if options&linux.WNOHANG == 0 { - wopts.BlockInterruptErr = kernel.ERESTARTSYS + wopts.BlockInterruptErr = syserror.ERESTARTSYS } if options&linux.WNOTHREAD == 0 { wopts.SiblingChildren = true @@ -311,13 +312,13 @@ func wait4(t *kernel.Task, pid int, statusAddr usermem.Addr, options int, rusage return 0, err } if statusAddr != 0 { - if _, err := t.CopyOut(statusAddr, wr.Status); err != nil { + if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil { return 0, err } } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) - if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, err } } @@ -395,14 +396,14 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // as well. if infop != 0 { var si arch.SignalInfo - _, err = t.CopyOut(infop, &si) + _, err = si.CopyOut(t, infop) } } return 0, nil, err } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) - if _, err := t.CopyOut(rusageAddr, &ru); err != nil { + if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, nil, err } } @@ -441,7 +442,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal default: t.Warningf("waitid got incomprehensible wait status %d", s) } - _, err = t.CopyOut(infop, &si) + _, err = si.CopyOut(t, infop) return 0, nil, err } @@ -558,9 +559,7 @@ func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // third argument to this system call is nowadays unused. if cpu != 0 { - buf := t.CopyScratchBuffer(4) - usermem.ByteOrder.PutUint32(buf, uint32(t.CPU())) - if _, err := t.CopyOutBytes(cpu, buf); err != nil { + if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil { return 0, nil, err } } diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 2d2aa0819..c5054d2f1 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -19,6 +19,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -168,7 +169,7 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(r), nil, nil } - if _, err := t.CopyOut(addr, r); err != nil { + if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } return uintptr(r), nil, nil @@ -213,7 +214,7 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error return nil } - return syserror.ConvertIntr(err, kernel.ERESTARTNOHAND) + return syserror.ConvertIntr(err, syserror.ERESTARTNOHAND) } // clockNanosleepFor blocks for a specified duration. @@ -254,7 +255,7 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem use duration: remaining, rem: rem, }) - return kernel.ERESTART_RESTARTBLOCK + return syserror.ERESTART_RESTARTBLOCK default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } @@ -334,8 +335,8 @@ func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. // Ask the time package for the timezone. _, offset := time.Now().Zone() // This int32 array mimics linux's struct timezone. - timezone := [2]int32{-int32(offset) / 60, 0} - _, err := t.CopyOut(tz, timezone) + timezone := []int32{-int32(offset) / 60, 0} + _, err := primitive.CopyInt32SliceOut(t, tz, timezone) return 0, nil, err } return 0, nil, nil diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index a4c400f87..45eef4feb 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -21,81 +21,63 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) const nsecPerSec = int64(time.Second) -// copyItimerValIn copies an ItimerVal from the untrusted app range to the -// kernel. The ItimerVal may be either 32 or 64 bits. -// A NULL address is allowed because because Linux allows -// setitimer(which, NULL, &old_value) which disables the timer. -// There is a KERN_WARN message saying this misfeature will be removed. -// However, that hasn't happened as of 3.19, so we continue to support it. -func copyItimerValIn(t *kernel.Task, addr usermem.Addr) (linux.ItimerVal, error) { - if addr == usermem.Addr(0) { - return linux.ItimerVal{}, nil - } - - switch t.Arch().Width() { - case 8: - // Native size, just copy directly. - var itv linux.ItimerVal - if _, err := t.CopyIn(addr, &itv); err != nil { - return linux.ItimerVal{}, err - } - - return itv, nil - default: - return linux.ItimerVal{}, syserror.ENOSYS - } -} - -// copyItimerValOut copies an ItimerVal to the untrusted app range. -// The ItimerVal may be either 32 or 64 bits. -// A NULL address is allowed, in which case no copy takes place -func copyItimerValOut(t *kernel.Task, addr usermem.Addr, itv *linux.ItimerVal) error { - if addr == usermem.Addr(0) { - return nil - } - - switch t.Arch().Width() { - case 8: - // Native size, just copy directly. - _, err := t.CopyOut(addr, itv) - return err - default: - return syserror.ENOSYS - } -} - // Getitimer implements linux syscall getitimer(2). func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + if t.Arch().Width() != 8 { + // Definition of linux.ItimerVal assumes 64-bit architecture. + return 0, nil, syserror.ENOSYS + } + timerID := args[0].Int() - val := args[1].Pointer() + addr := args[1].Pointer() olditv, err := t.Getitimer(timerID) if err != nil { return 0, nil, err } - return 0, nil, copyItimerValOut(t, val, &olditv) + // A NULL address is allowed, in which case no copy out takes place. + if addr == 0 { + return 0, nil, nil + } + _, err = olditv.CopyOut(t, addr) + return 0, nil, err } // Setitimer implements linux syscall setitimer(2). func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - timerID := args[0].Int() - newVal := args[1].Pointer() - oldVal := args[2].Pointer() + if t.Arch().Width() != 8 { + // Definition of linux.ItimerVal assumes 64-bit architecture. + return 0, nil, syserror.ENOSYS + } - newitv, err := copyItimerValIn(t, newVal) - if err != nil { - return 0, nil, err + timerID := args[0].Int() + newAddr := args[1].Pointer() + oldAddr := args[2].Pointer() + + var newitv linux.ItimerVal + // A NULL address is allowed because because Linux allows + // setitimer(which, NULL, &old_value) which disables the timer. There is a + // KERN_WARN message saying this misfeature will be removed. However, that + // hasn't happened as of 3.19, so we continue to support it. + if newAddr != 0 { + if _, err := newitv.CopyIn(t, newAddr); err != nil { + return 0, nil, err + } } olditv, err := t.Setitimer(timerID, newitv) if err != nil { return 0, nil, err } - return 0, nil, copyItimerValOut(t, oldVal, &olditv) + // A NULL address is allowed, in which case no copy out takes place. + if oldAddr == 0 { + return 0, nil, nil + } + _, err = olditv.CopyOut(t, oldAddr) + return 0, nil, err } // Alarm implements linux syscall alarm(2). @@ -131,7 +113,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S var sev *linux.Sigevent if sevp != 0 { sev = &linux.Sigevent{} - if _, err = t.CopyIn(sevp, sev); err != nil { + if _, err = sev.CopyIn(t, sevp); err != nil { return 0, nil, err } } @@ -141,7 +123,7 @@ func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - if _, err := t.CopyOut(timerIDp, &id); err != nil { + if _, err := id.CopyOut(t, timerIDp); err != nil { t.IntervalTimerDelete(id) return 0, nil, err } @@ -157,7 +139,7 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. oldValAddr := args[3].Pointer() var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) @@ -165,9 +147,8 @@ func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, err } if oldValAddr != 0 { - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { - return 0, nil, err - } + _, err = oldVal.CopyOut(t, oldValAddr) + return 0, nil, err } return 0, nil, nil } @@ -181,7 +162,7 @@ func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - _, err = t.CopyOut(curValAddr, &curVal) + _, err = curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cf49b43db..cadd9d348 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } f := timerfd.NewFile(t, c) - defer f.DecRef() + defer f.DecRef(t) f.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) @@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { @@ -81,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tf.Clock()) @@ -91,7 +91,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, oldS := tf.SetTime(newS) if oldValAddr != 0 { oldVal := ktime.ItimerspecFromSetting(tm, oldS) - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + if _, err := oldVal.CopyOut(t, oldValAddr); err != nil { return 0, nil, err } } @@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { @@ -116,6 +116,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, s := tf.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) - _, err := t.CopyOut(curValAddr, &curVal) + _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_tls_amd64.go b/pkg/sentry/syscalls/linux/sys_tls_amd64.go index b3eb96a1c..6ddd30d5c 100644 --- a/pkg/sentry/syscalls/linux/sys_tls_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_tls_amd64.go @@ -18,6 +18,7 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -30,17 +31,19 @@ func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys case linux.ARCH_GET_FS: addr := args[1].Pointer() fsbase := t.Arch().TLS() - _, err := t.CopyOut(addr, uint64(fsbase)) - if err != nil { - return 0, nil, err + switch t.Arch().Width() { + case 8: + if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil { + return 0, nil, err + } + default: + return 0, nil, syserror.ENOSYS } - case linux.ARCH_SET_FS: fsbase := args[1].Uint64() if !t.Arch().SetTLS(uintptr(fsbase)) { return 0, nil, syserror.EPERM } - case linux.ARCH_GET_GS, linux.ARCH_SET_GS: t.Kernel().EmitUnimplementedEvent(t) fallthrough diff --git a/pkg/sentry/syscalls/linux/sys_utsname.go b/pkg/sentry/syscalls/linux/sys_utsname.go index e9d702e8e..66c5974f5 100644 --- a/pkg/sentry/syscalls/linux/sys_utsname.go +++ b/pkg/sentry/syscalls/linux/sys_utsname.go @@ -46,7 +46,7 @@ func Uname(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Copy out the result. va := args[0].Pointer() - _, err := t.CopyOut(va, u) + _, err := u.CopyOut(t, va) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 6ec0de96e..95bfe6606 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -71,7 +71,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "write", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "write", file) } // Pwrite64 implements linux syscall pwrite64(2). @@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -118,7 +118,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) } // Writev implements linux syscall writev(2). @@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -148,7 +148,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) } // Pwritev implements linux syscall pwritev(2). @@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -189,7 +189,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements linux syscall pwritev2(2). @@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -250,12 +250,12 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if offset == -1 { n, err := writev(t, file, src) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) } n, err := pwritev(t, file, src, offset) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, handleIOError(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) } func writev(t *kernel.Task, f *fs.File, src usermem.IOSequence) (int64, error) { diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index c24946160..97474fd3c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) if err != nil { @@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) } @@ -270,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := listXattr(t, f.Dirent, listAddr, size) if err != nil { @@ -384,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, removeXattr(t, f.Dirent, nameAddr) } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 9c8b44f64..9ee766552 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,6 +5,7 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "aio.go", "epoll.go", "eventfd.go", "execve.go", @@ -14,8 +15,10 @@ go_library( "getdents.go", "inotify.go", "ioctl.go", + "lock.go", "memfd.go", "mmap.go", + "mount.go", "path.go", "pipe.go", "poll.go", @@ -38,9 +41,14 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bits", + "//pkg/context", "//pkg/fspath", "//pkg/gohacks", + "//pkg/log", + "//pkg/marshal", + "//pkg/marshal/primitive", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", @@ -49,11 +57,13 @@ go_library( "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/fasync", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/loader", "//pkg/sentry/memmap", + "//pkg/sentry/mm", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go new file mode 100644 index 000000000..6d0a38330 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -0,0 +1,226 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/marshal/primitive" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/mm" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// IoSubmit implements linux syscall io_submit(2). +func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + id := args[0].Uint64() + nrEvents := args[1].Int() + addr := args[2].Pointer() + + if nrEvents < 0 { + return 0, nil, syserror.EINVAL + } + + for i := int32(0); i < nrEvents; i++ { + // Copy in the callback address. + var cbAddr usermem.Addr + switch t.Arch().Width() { + case 8: + var cbAddrP primitive.Uint64 + if _, err := cbAddrP.CopyIn(t, addr); err != nil { + if i > 0 { + // Some successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + cbAddr = usermem.Addr(cbAddrP) + default: + return 0, nil, syserror.ENOSYS + } + + // Copy in this callback. + var cb linux.IOCallback + if _, err := cb.CopyIn(t, cbAddr); err != nil { + if i > 0 { + // Some have been successful. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Process this callback. + if err := submitCallback(t, id, &cb, cbAddr); err != nil { + if i > 0 { + // Partial success. + return uintptr(i), nil, nil + } + // Nothing done. + return 0, nil, err + } + + // Advance to the next one. + addr += usermem.Addr(t.Arch().Width()) + } + + return uintptr(nrEvents), nil, nil +} + +// submitCallback processes a single callback. +func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr usermem.Addr) error { + if cb.Reserved2 != 0 { + return syserror.EINVAL + } + + fd := t.GetFileVFS2(cb.FD) + if fd == nil { + return syserror.EBADF + } + defer fd.DecRef(t) + + // Was there an eventFD? Extract it. + var eventFD *vfs.FileDescription + if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { + eventFD = t.GetFileVFS2(cb.ResFD) + if eventFD == nil { + return syserror.EBADF + } + defer eventFD.DecRef(t) + + // Check that it is an eventfd. + if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { + return syserror.EINVAL + } + } + + ioseq, err := memoryFor(t, cb) + if err != nil { + return err + } + + // Check offset for reads/writes. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + if cb.Offset < 0 { + return syserror.EINVAL + } + } + + // Prepare the request. + aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) + if !ok { + return syserror.EINVAL + } + if ready := aioCtx.Prepare(); !ready { + // Context is busy. + return syserror.EAGAIN + } + + if eventFD != nil { + // The request is set. Make sure there's a ref on the file. + // + // This is necessary when the callback executes on completion, + // which is also what will release this reference. + eventFD.IncRef() + } + + // Perform the request asynchronously. + fd.IncRef() + t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) + return nil +} + +func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr usermem.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { + return func(ctx context.Context) { + // Release references after completing the callback. + defer fd.DecRef(ctx) + if eventFD != nil { + defer eventFD.DecRef(ctx) + } + + if aioCtx.Dead() { + aioCtx.CancelPendingRequest() + return + } + ev := &linux.IOEvent{ + Data: cb.Data, + Obj: uint64(cbAddr), + } + + var err error + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: + ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) + case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: + ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: + err = fd.Sync(ctx) + } + + // Update the result. + if err != nil { + err = slinux.HandleIOErrorVFS2(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) + ev.Result = -int64(kernel.ExtractErrno(err, 0)) + } + + // Queue the result for delivery. + aioCtx.FinishRequest(ev) + + // Notify the event file if one was specified. This needs to happen + // *after* queueing the result to avoid racing with the thread we may + // wake up. + if eventFD != nil { + eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) + } + } +} + +// memoryFor returns appropriate memory for the given callback. +func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { + bytes := int(cb.Bytes) + if bytes < 0 { + // Linux also requires that this field fit in ssize_t. + return usermem.IOSequence{}, syserror.EINVAL + } + + // Since this I/O will be asynchronous with respect to t's task goroutine, + // we have no guarantee that t's AddressSpace will be active during the + // I/O. + switch cb.OpCode { + case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: + return t.SingleIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: + return t.IovecsIOSequence(usermem.Addr(cb.Buf), bytes, usermem.IOOpts{ + AddressSpaceActive: false, + }) + + case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: + return usermem.IOSequence{}, nil + + default: + // Not a supported command. + return usermem.IOSequence{}, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 34c90ae3e..d0cbb77eb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -24,7 +24,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -37,11 +36,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, @@ -62,11 +61,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -86,7 +85,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL @@ -95,7 +94,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if epfile == file { return 0, nil, syserror.EINVAL } @@ -135,56 +134,32 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL } - // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, - // maxEvents), so that the buffer can be allocated on the stack. + // Allocate space for a few events on the stack for the common case in + // which we don't have too many events. var ( - events [16]linux.EpollEvent - total int + eventsArr [16]linux.EpollEvent ch chan struct{} haveDeadline bool deadline ktime.Time ) for { - batchEvents := len(events) - if batchEvents > maxEvents { - batchEvents = maxEvents - } - n := ep.ReadEvents(events[:batchEvents]) - maxEvents -= n - if n != 0 { - // Copy what we read out. - copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n]) + events := ep.ReadEvents(eventsArr[:0], maxEvents) + if len(events) != 0 { + copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events) copiedEvents := copiedBytes / sizeofEpollEvent // rounded down - eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) - total += copiedEvents - if err != nil { - if total != 0 { - return uintptr(total), nil, nil - } - return 0, nil, err - } - // If we've filled the application's event buffer, we're done. - if maxEvents == 0 { - return uintptr(total), nil, nil - } - // Loop if we read a full batch, under the expectation that there - // may be more events to read. - if n == batchEvents { - continue + if copiedEvents != 0 { + return uintptr(copiedEvents), nil, nil } + return 0, nil, err } - // We get here if n != batchEvents. If we read any number of events - // (just now, or in a previous iteration of this loop), or if timeout - // is 0 (such that epoll_wait should be non-blocking), return the - // events we've read so far to the application. - if total != 0 || timeout == 0 { - return uintptr(total), nil, nil + if timeout == 0 { + return 0, nil, nil } // In the first iteration of this loop, register with the epoll // instance for readability events, but then immediately continue the @@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if err == syserror.ETIMEDOUT { err = nil } - // total must be 0 since otherwise we would have returned - // above. return 0, nil, err } } diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index aff1a2070..807f909da 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) + eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } - defer eventfd.DecRef() + defer eventfd.DecRef(t) fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index aef0078a8..c8ce2aabc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) var executable fsbridge.File closeOnExec := false if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { @@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } start := dirfile.VirtualDentry() start.IncRef() - dirfile.DecRef() + dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, @@ -101,19 +101,18 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user Flags: linux.O_RDONLY, FileExec: true, }) - start.DecRef() + start.DecRef(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) executable = fsbridge.NewVFSFile(file) } // Load the new TaskContext. - mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change - defer mntns.DecRef() + mntns := t.MountNamespaceVFS2() wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewVFSLookup(mntns, root, wd), diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index ca0f7fd1e..36e89700e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,10 +17,13 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,11 +34,11 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete). - _, file := t.FDTable().Remove(fd) + _, file := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.OnClose(t) return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -49,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -69,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - file.DecRef() + file.DecRef(t) return uintptr(newfd), nil, nil } @@ -98,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -118,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -134,14 +137,49 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() - t.FDTable().SetFlags(fd, kernel.FDFlags{ + err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) - return 0, nil, nil + return 0, nil, err case linux.F_GETFL: return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + case linux.F_GETOWN: + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { + return 0, nil, nil + } + if owner.Type == linux.F_OWNER_PGRP { + return uintptr(-owner.PID), nil, nil + } + return uintptr(owner.PID), nil, nil + case linux.F_SETOWN: + who := args[2].Int() + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) + case linux.F_GETOWN_EX: + owner, hasOwner := getAsyncOwner(t, file) + if !hasOwner { + return 0, nil, nil + } + _, err := owner.CopyOut(t, args[2].Pointer()) + return 0, nil, err + case linux.F_SETOWN_EX: + var owner linux.FOwnerEx + _, err := owner.CopyIn(t, args[2].Pointer()) + if err != nil { + return 0, nil, err + } + return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { @@ -167,8 +205,151 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err + case linux.F_SETLK, linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, cmd) default: - // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. + // Everything else is not yet supported. return 0, nil, syserror.EINVAL } } + +func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { + a := fd.AsyncHandler() + if a == nil { + return linux.FOwnerEx{}, false + } + + ot, otg, opg := a.(*fasync.FileAsync).Owner() + switch { + case ot != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_TID, + PID: int32(t.PIDNamespace().IDOfTask(ot)), + }, true + case otg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PID, + PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), + }, true + case opg != nil: + return linux.FOwnerEx{ + Type: linux.F_OWNER_PGRP, + PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), + }, true + default: + return linux.FOwnerEx{}, true + } +} + +func setAsyncOwner(t *kernel.Task, fd *vfs.FileDescription, ownerType, pid int32) error { + switch ownerType { + case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: + // Acceptable type. + default: + return syserror.EINVAL + } + + a := fd.SetAsyncHandler(fasync.NewVFS2).(*fasync.FileAsync) + if pid == 0 { + a.ClearOwner() + return nil + } + + switch ownerType { + case linux.F_OWNER_TID: + task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) + if task == nil { + return syserror.ESRCH + } + a.SetOwnerTask(t, task) + return nil + case linux.F_OWNER_PID: + tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) + if tg == nil { + return syserror.ESRCH + } + a.SetOwnerThreadGroup(t, tg) + return nil + case linux.F_OWNER_PGRP: + pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) + if pg == nil { + return syserror.ESRCH + } + a.SetOwnerProcessGroup(t, pg) + return nil + default: + return syserror.EINVAL + } +} + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := flock.CopyIn(t, flockAddr); err != nil { + return err + } + + var blocker lock.Blocker + if cmd == linux.F_SETLKW { + blocker = t + } + + switch flock.Type { + case linux.F_RDLCK: + if !file.IsReadable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_WRLCK: + if !file.IsWritable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_UNLCK: + return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + + default: + return syserror.EINVAL + } +} + +// Fadvise64 implements fadvise64(2). +// This implementation currently ignores the provided advice. +func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[2].Int64() + advice := args[3].Int() + + // Note: offset is allowed to be negative. + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef(t) + + // If the FD refers to a pipe or FIFO, return error. + if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { + return 0, nil, syserror.ESPIPE + } + + switch advice { + case linux.POSIX_FADV_NORMAL: + case linux.POSIX_FADV_RANDOM: + case linux.POSIX_FADV_SEQUENTIAL: + case linux.POSIX_FADV_WILLNEED: + case linux.POSIX_FADV_DONTNEED: + case linux.POSIX_FADV_NOREUSE: + default: + return 0, nil, syserror.EINVAL + } + + // Sure, whatever. + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 46d3e189c..01e0f9010 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -56,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -66,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } @@ -95,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) @@ -106,7 +106,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall addr := args[0].Pointer() mode := args[1].ModeT() dev := args[2].Uint() - return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev) + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) } // Mknodat implements Linux syscall mknodat(2). @@ -115,10 +115,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca addr := args[1].Pointer() mode := args[2].ModeT() dev := args[3].Uint() - return 0, nil, mknodat(t, dirfd, addr, mode, dev) + return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) } -func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error { +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error { path, err := copyInPath(t, addr) if err != nil { return err @@ -127,10 +127,15 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) + + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + if mode.FileType() == 0 { + mode |= linux.ModeRegular + } major, minor := linux.DecodeDeviceID(dev) return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ - Mode: linux.FileMode(mode &^ t.FSContext().Umask()), + Mode: mode &^ linux.FileMode(t.FSContext().Umask()), DevMajor: uint32(major), DevMinor: minor, }) @@ -169,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, @@ -178,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -222,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -232,7 +237,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, @@ -254,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } @@ -273,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } @@ -313,6 +318,9 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } + if len(target) == 0 { + return syserror.ENOENT + } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { return err @@ -321,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) } diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index 317409a18..a7d4d2a36 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal root := t.FSContext().RootDirectoryVFS2() wd := t.FSContext().WorkingDirectoryVFS2() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) - root.DecRef() - wd.DecRef() + root.DecRef(t) + wd.DecRef(t) if err != nil { return 0, nil, err } @@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetRootDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetRootDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index c7c7bf7ce..5517595b5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) cb := getGetdentsCallback(t, addr, size, isGetdents64) err := file.IterDirents(t, cb) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 7d50b6a16..11753d8e5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -35,7 +35,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - defer ino.DecRef() + defer ino.DecRef(t) fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -66,7 +66,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. - f.DecRef() + f.DecRef(t) return nil, nil, syserror.EINVAL } @@ -81,7 +81,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) - if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + if mask&linux.ALL_INOTIFY_BITS == 0 { return 0, nil, syserror.EINVAL } @@ -96,7 +96,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer f.DecRef() + defer f.DecRef(t) path, err := copyInPath(t, addr) if err != nil { @@ -109,15 +109,18 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) if err != nil { return 0, nil, err } - defer d.DecRef() + defer d.DecRef(t) - fd = ino.AddWatch(d.Dentry(), mask) - return uintptr(fd), nil, err + fd, err = ino.AddWatch(d.Dentry(), mask) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil } // InotifyRmWatch implements the inotify_rm_watch() syscall. @@ -129,6 +132,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer f.DecRef() - return 0, nil, ino.RmWatch(wd) + defer f.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 5a2418da9..2806c3f6f 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -15,6 +15,8 @@ package vfs2 import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -28,7 +30,78 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) + + // Handle ioctls that apply to all FDs. + switch args[1].Int() { + case linux.FIONCLEX: + t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + + case linux.FIOCLEX: + t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_NONBLOCK + } else { + flags &^= linux.O_NONBLOCK + } + return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + + case linux.FIOASYNC: + var set int32 + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_ASYNC + } else { + flags &^= linux.O_ASYNC + } + file.SetStatusFlags(t, t.Credentials(), flags) + return 0, nil, nil + + case linux.FIOGETOWN, linux.SIOCGPGRP: + var who int32 + owner, hasOwner := getAsyncOwner(t, file) + if hasOwner { + if owner.Type == linux.F_OWNER_PGRP { + who = -owner.PID + } else { + who = owner.PID + } + } + _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) + return 0, nil, err + + case linux.FIOSETOWN, linux.SIOCSPGRP: + var who int32 + if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { + return 0, nil, err + } + ownerType := int32(linux.F_OWNER_PID) + if who < 0 { + // Check for overflow before flipping the sign. + if who-1 > who { + return 0, nil, syserror.EINVAL + } + ownerType = linux.F_OWNER_PGRP + who = -who + } + return 0, nil, setAsyncOwner(t, file, ownerType, who) + } ret, err := file.Ioctl(t, t.MemoryManager(), args) return ret, nil, err diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go new file mode 100644 index 000000000..b910b5a74 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + operation := args[1].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef(t) + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + var blocker lock.Blocker + if !nonblocking { + blocker = t + } + + switch operation { + case linux.LOCK_EX: + if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_SH: + if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_UN: + if err := file.UnlockBSD(t); err != nil { + return 0, nil, err + } + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index bbe248d17..c4c0f9e0a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -47,10 +47,11 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } shmMount := t.Kernel().ShmMount() - file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) if err != nil { return 0, nil, err } + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index 60a43f0a0..9d9dbf775 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" @@ -61,7 +62,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -71,7 +72,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { @@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } + } else if shared { + // Back shared anonymous mappings with an anonymous tmpfs file. + opts.Offset = 0 + file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length) + if err != nil { + return 0, nil, err + } + defer file.DecRef(t) + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } } rv, err := t.MemoryManager().MMap(t, opts) diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go new file mode 100644 index 000000000..769c9b92f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -0,0 +1,150 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + // For null-terminated strings related to mount(2), Linux copies in at most + // a page worth of data. See fs/namespace.c:copy_mount_string(). + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + source, err := t.CopyInString(sourceAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + targetPath, err := copyInPath(t, targetAddr) + if err != nil { + return 0, nil, err + } + + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + var opts vfs.MountOptions + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + opts.Flags.NoATime = true + } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + opts.Flags.NoExec = true + } + if flags&linux.MS_NODEV == linux.MS_NODEV { + opts.Flags.NoDev = true + } + if flags&linux.MS_NOSUID == linux.MS_NOSUID { + opts.Flags.NoSUID = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + opts.ReadOnly = true + } + opts.GetFilesystemOptions.Data = data + + target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer target.Release(t) + _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) + return 0, nil, err +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release(t) + + opts := vfs.UmountOptions{ + Flags: uint32(flags), + } + + return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 97da6c647..90a511d9a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.ENOENT } if dirfd == linux.AT_FDCWD { @@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true - dirfile.DecRef() + dirfile.DecRef(t) } } return taskPathOperation{ @@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA }, nil } -func (tpop *taskPathOperation) Release() { - tpop.pop.Root.DecRef() +func (tpop *taskPathOperation) Release(t *kernel.Task) { + tpop.pop.Root.DecRef(t) if tpop.haveStartRef { - tpop.pop.Start.DecRef() + tpop.pop.Start.DecRef(t) tpop.haveStartRef = false } } diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index 4a01e4209..ee38fdca0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -42,8 +43,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { return syserror.EINVAL } r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(t) + defer w.DecRef(t) fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -51,10 +52,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { if err != nil { return err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + if _, file := t.FDTable().Remove(t, fd); file != nil { + file.DecRef(t) } } return err diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index ff1b25d7b..c22e4ce54 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -165,7 +165,7 @@ func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD pfd := make([]linux.PollFD, nfds) if nfds > 0 { - if _, err := t.CopyIn(addr, &pfd); err != nil { + if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } @@ -192,7 +192,7 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { - if _, err := t.CopyOut(addr, pfd); err != nil { + if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } @@ -205,7 +205,7 @@ func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialBy set := make([]byte, nBytes) if addr != 0 { - if _, err := t.CopyIn(addr, &set); err != nil { + if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. @@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { @@ -332,19 +332,19 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add // Copy updated vectors back. if readFDs != 0 { - if _, err := t.CopyOut(readFDs, r); err != nil { + if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { - if _, err := t.CopyOut(writeFDs, w); err != nil { + if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { - if _, err := t.CopyOut(exceptFDs, e); err != nil { + if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } @@ -415,7 +415,7 @@ func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration nfds: nfds, timeout: remainingTimeout, }) - return 0, kernel.ERESTART_RESTARTBLOCK + return 0, syserror.ERESTART_RESTARTBLOCK } return n, err } @@ -462,7 +462,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } @@ -492,11 +492,17 @@ func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + // Pselect implements linux syscall pselect(2). func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. @@ -533,17 +539,11 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // See comment in Ppoll. if err == syserror.EINTR && copyErr == nil { - err = kernel.ERESTARTNOHAND + err = syserror.ERESTARTNOHAND } return n, nil, err } -// +marshal -type sigSetWithSize struct { - sigsetAddr uint64 - sizeofSigset uint64 -} - // copyTimespecInToDuration copies a Timespec from the untrusted app range, // validates it and converts it to a Duration. // diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 7f9debd4a..b77b29dcc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -62,7 +62,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file) } // Readv implements Linux syscall readv(2). @@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -87,14 +87,14 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) } func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -174,7 +174,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) } // Preadv implements Linux syscall preadv(2). @@ -188,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -205,7 +205,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) } // Preadv2 implements Linux syscall preadv2(2). @@ -226,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -251,14 +251,14 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err = pread(t, file, dst, offset, opts) } t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) } func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -314,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -332,7 +332,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file) } // Writev implements Linux syscall writev(2). @@ -345,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -357,14 +357,14 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) } func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -421,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -444,7 +444,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) } // Pwritev implements Linux syscall pwritev(2). @@ -458,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -475,7 +475,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) } // Pwritev2 implements Linux syscall pwritev2(2). @@ -496,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -521,14 +521,14 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc n, err = pwrite(t, file, src, offset, opts) } t.IOUsage().AccountWriteSyscall(n) - return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) } func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -601,8 +601,41 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err } + +// Readahead implements readahead(2). +func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef(t) + + // Check that the file is readable. + if !file.IsReadable() { + return 0, nil, syserror.EBADF + } + + // Check that the size is valid. + if int(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the offset is legitimate and does not overflow. + if offset < 0 || offset+int64(size) < 0 { + return 0, nil, syserror.EINVAL + } + + // Return EINVAL; if the underlying file type does not support readahead, + // then Linux will return EINVAL to indicate as much. In the future, we + // may extend this function to actually support readahead hints. + return 0, nil, syserror.EINVAL +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 09ecfed26..1ee37e5a8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -65,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ @@ -150,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { @@ -178,6 +179,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Mask: linux.STATX_SIZE, Size: uint64(length), }, + NeedWritePerm: true, }) return 0, nil, handleSetSizeError(t, err) } @@ -195,7 +197,11 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) + + if !file.IsWritable() { + return 0, nil, syserror.EINVAL + } err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ @@ -206,6 +212,56 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, handleSetSizeError(t, err) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef(t) + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + if err := file.Allocate(t, mode, uint64(offset), uint64(length)); err != nil { + return 0, nil, err + } + + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + return 0, nil, nil +} + // Utime implements Linux syscall utime(2). func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() @@ -290,7 +346,7 @@ func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opt return nil } var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { @@ -354,7 +410,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op return nil } var times [2]linux.Timespec - if _, err := t.CopyIn(timesAddr, ×); err != nil { + if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Nsec != linux.UTIME_OMIT { @@ -382,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { @@ -390,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -401,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa // VirtualFilesystem.SetStatAt(), since the former may be able // to use opened file state to expedite the SetStat. err := dirfile.SetStat(t, *opts) - dirfile.DecRef() + dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 623992f6f..b89f34cdb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { @@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Create a new descriptor. fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 10b668477..7b33b3f59 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -64,10 +66,10 @@ const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. -var messageHeader64Len = uint64(binary.Size(MessageHeader64{})) +var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. -var multipleMessageHeader64Len = uint64(binary.Size(multipleMessageHeader64{})) +var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). @@ -75,6 +77,8 @@ const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. +// +// +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 @@ -103,30 +107,14 @@ type MessageHeader64 struct { // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. +// +// +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } -// CopyInMessageHeader64 copies a message header from user to kernel memory. -func CopyInMessageHeader64(t *kernel.Task, addr usermem.Addr, msg *MessageHeader64) error { - b := t.CopyScratchBuffer(52) - if _, err := t.CopyInBytes(addr, b); err != nil { - return err - } - - msg.Name = usermem.ByteOrder.Uint64(b[0:]) - msg.NameLen = usermem.ByteOrder.Uint32(b[8:]) - msg.Iov = usermem.ByteOrder.Uint64(b[16:]) - msg.IovLen = usermem.ByteOrder.Uint64(b[24:]) - msg.Control = usermem.ByteOrder.Uint64(b[32:]) - msg.ControlLen = usermem.ByteOrder.Uint64(b[40:]) - msg.Flags = int32(usermem.ByteOrder.Uint32(b[48:])) - - return nil -} - // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, error) { @@ -145,10 +133,10 @@ func CaptureAddress(t *kernel.Task, addr usermem.Addr, addrlen uint32) ([]byte, // writeAddress writes a sockaddr structure and its length to an output buffer // in the unstrusted address space range. If the address is bigger than the // buffer, it is truncated. -func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { +func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr usermem.Addr, addrLenPtr usermem.Addr) error { // Get the buffer length. var bufLen uint32 - if _, err := t.CopyIn(addrLenPtr, &bufLen); err != nil { + if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } @@ -157,7 +145,7 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Write the length unconditionally. - if _, err := t.CopyOut(addrLenPtr, addrLen); err != nil { + if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } @@ -170,7 +158,8 @@ func writeAddress(t *kernel.Task, addr interface{}, addrLen uint32, addrPtr user } // Copy as much of the address as will fit in the buffer. - encodedAddr := binary.Marshal(nil, usermem.ByteOrder, addr) + encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) + addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } @@ -194,7 +183,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if e != nil { return 0, nil, e.ToError() } - defer s.DecRef() + defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err @@ -228,8 +217,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { @@ -248,10 +237,10 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, err } - if _, err := t.CopyOut(addr, fds); err != nil { + if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { - if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + if _, file := t.FDTable().Remove(t, fd); file != nil { + file.DecRef(t) } } return 0, nil, err @@ -271,7 +260,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -286,7 +275,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 - return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), kernel.ERESTARTSYS) + return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept @@ -302,7 +291,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -317,7 +306,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't @@ -361,7 +350,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -388,7 +377,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -417,7 +406,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -448,7 +437,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -457,8 +446,8 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } // Read the length. Reject negative values. - optLen := int32(0) - if _, err := t.CopyIn(optLenAddr, &optLen); err != nil { + var optLen int32 + if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { @@ -472,12 +461,12 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } vLen := int32(binary.Size(v)) - if _, err := t.CopyOut(optLenAddr, vLen); err != nil { + if _, err := primitive.CopyInt32Out(t, optLenAddr, vLen); err != nil { return 0, nil, err } if v != nil { - if _, err := t.CopyOut(optValAddr, v); err != nil { + if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } @@ -487,7 +476,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. -func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) { +func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr usermem.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: @@ -499,13 +488,16 @@ func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr switch name { case linux.SO_TYPE: _, skType, _ := s.Type() - return int32(skType), nil + v := primitive.Int32(skType) + return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() - return int32(family), nil + v := primitive.Int32(family) + return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() - return int32(protocol), nil + v := primitive.Int32(protocol) + return &v, nil } } @@ -527,7 +519,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -542,7 +534,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, syserror.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) - if _, err := t.CopyIn(optValAddr, &buf); err != nil { + if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } @@ -565,7 +557,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -593,7 +585,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -626,7 +618,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -679,7 +671,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -731,7 +723,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -746,7 +738,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -769,16 +761,16 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { - return 0, syserror.ConvertIntr(err.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } @@ -791,9 +783,9 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -815,17 +807,17 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } // Copy the control data to the caller. - if _, err := t.CopyOut(msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { + if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { - if _, err := t.CopyOut(usermem.Addr(msg.Control), controlData); err != nil { + if _, err := t.CopyOutBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. - if _, err := t.CopyOut(msgPtr+flagsOffset, int32(mflags)); err != nil { + if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } @@ -849,7 +841,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -878,9 +870,9 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { - return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) + return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } // Copy the address to the caller. @@ -922,7 +914,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -960,7 +952,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -994,7 +986,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if !ok { return 0, nil, syserror.EFAULT } - if _, err = t.CopyOut(lp, uint32(n)); err != nil { + if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ @@ -1009,7 +1001,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr usermem.Addr, flags int32) (uintptr, error) { // Capture the message header. var msg MessageHeader64 - if err := CopyInMessageHeader64(t, msgPtr, &msg); err != nil { + if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } @@ -1020,7 +1012,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio return 0, syserror.ENOBUFS } controlData = make([]byte, msg.ControlLen) - if _, err := t.CopyIn(usermem.Addr(msg.Control), &controlData); err != nil { + if _, err := t.CopyInBytes(usermem.Addr(msg.Control), controlData); err != nil { return 0, err } } @@ -1062,9 +1054,11 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) - err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) - if err != nil { - controlMessages.Release() + err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { + controlMessages.Release(t) } return uintptr(n), err } @@ -1082,7 +1076,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -1122,7 +1116,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) - return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendto", file) + return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 945a364a7..035e2a6b0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -15,12 +15,18 @@ package vfs2 import ( + "io" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -39,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { @@ -50,12 +59,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -85,7 +94,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile.Options().DenyPRead { return 0, nil, syserror.EINVAL } - if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { @@ -100,7 +109,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile.Options().DenyPWrite { return 0, nil, syserror.EINVAL } - if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { @@ -110,89 +119,67 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // Move data. var ( - n int64 - err error - inCh chan struct{} - outCh chan struct{} + n int64 + err error ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() for { // If both input and output are pipes, delegate to the pipe - // implementation. Otherwise, exactly one end is a pipe, which we - // ensure is consistently ordered after the non-pipe FD's locks by - // passing the pipe FD as usermem.IO to the non-pipe end. + // implementation. Otherwise, exactly one end is a pipe, which + // we ensure is consistently ordered after the non-pipe FD's + // locks by passing the pipe FD as usermem.IO to the non-pipe + // end. switch { case inIsPipe && outIsPipe: n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) case inIsPipe: + n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count) if outOffset != -1 { - n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) outOffset += n - } else { - n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) } case outIsPipe: + n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count) if inOffset != -1 { - n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) inOffset += n - } else { - n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) } + default: + panic("at least one end of splice must be a pipe") } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { break } - - // Note that the blocking behavior here is a bit different than the - // normal pattern. Because we need to have both data to read and data - // to write simultaneously, we actually explicitly block on both of - // these cases in turn before returning to the splice operation. - if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { - if inCh == nil { - inCh = make(chan struct{}, 1) - inW, _ := waiter.NewChannelEntry(inCh) - inFile.EventRegister(&inW, eventMaskRead) - defer inFile.EventUnregister(&inW) - continue // Need to refresh readiness. - } - if err = t.Block(inCh); err != nil { - break - } - } - if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { - if outCh == nil { - outCh = make(chan struct{}, 1) - outW, _ := waiter.NewChannelEntry(outCh) - outFile.EventRegister(&outW, eventMaskWrite) - defer outFile.EventUnregister(&outW) - continue // Need to refresh readiness. - } - if err = t.Block(outCh); err != nil { - break - } + if err = dw.waitForBoth(t); err != nil { + break } } // Copy updated offsets out. if inOffsetPtr != 0 { - if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil { return 0, nil, err } } if outOffsetPtr != 0 { - if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil { return 0, nil, err } } - if n == 0 { - return 0, nil, err + if n != 0 { + // On Linux, inotify behavior is not very consistent with splice(2). We try + // our best to emulate Linux for very basic calls to splice, where for some + // reason, events are generated for output files, but not input files. + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } - // On Linux, inotify behavior is not very consistent with splice(2). We try - // our best to emulate Linux for very basic calls to splice, where for some - // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) - return uintptr(n), nil, nil + // We can only pass a single file to handleIOError, so pick inFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). @@ -208,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { @@ -219,12 +209,12 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -247,45 +237,274 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo // Copy data. var ( - inCh chan struct{} - outCh chan struct{} + n int64 + err error ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() for { - n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) - if n != 0 { - return uintptr(n), nil, nil + n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + if err = dw.waitForBoth(t); err != nil { + break + } + } + + if n != 0 { + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + + // If a partial write is completed, the error is dropped. Log it here. + if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + log.Debugf("tee completed a partial write with error: %v", err) + err = nil } - if err != syserror.ErrWouldBlock || nonBlock { + } + + // We can only pass a single file to handleIOError, so pick inFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) +} + +// Sendfile implements linux system call sendfile(2). +func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + outFD := args[0].Int() + inFD := args[1].Int() + offsetAddr := args[2].Pointer() + count := int64(args[3].SizeT()) + + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef(t) + if !inFile.IsReadable() { + return 0, nil, syserror.EBADF + } + + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef(t) + if !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // Verify that the outFile Append flag is not set. + if outFile.StatusFlags()&linux.O_APPEND != 0 { + return 0, nil, syserror.EINVAL + } + + // Verify that inFile is a regular file or block device. This is a + // requirement; the same check appears in Linux + // (fs/splice.c:splice_direct_to_actor). + if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { + return 0, nil, err + } else if stat.Mask&linux.STATX_TYPE == 0 || + (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { + return 0, nil, syserror.EINVAL + } + + // Copy offset if it exists. + offset := int64(-1) + if offsetAddr != 0 { + if inFile.Options().DenyPRead { + return 0, nil, syserror.ESPIPE + } + var offsetP primitive.Int64 + if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { return 0, nil, err } + offset = int64(offsetP) + + if offset < 0 { + return 0, nil, syserror.EINVAL + } + if offset+count < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Validate count. This must come after offset checks. + if count < 0 { + return 0, nil, syserror.EINVAL + } + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } - // Note that the blocking behavior here is a bit different than the - // normal pattern. Because we need to have both data to read and data - // to write simultaneously, we actually explicitly block on both of - // these cases in turn before returning to the tee operation. - if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { - if inCh == nil { - inCh = make(chan struct{}, 1) - inW, _ := waiter.NewChannelEntry(inCh) - inFile.EventRegister(&inW, eventMaskRead) - defer inFile.EventUnregister(&inW) - continue // Need to refresh readiness. + // Copy data. + var ( + n int64 + err error + ) + dw := dualWaiter{ + inFile: inFile, + outFile: outFile, + } + defer dw.destroy() + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + // Reading from input file should never block, since it is regular or + // block device. We only need to check if writing to the output file + // can block. + nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 + if outIsPipe { + for n < count { + var spliceN int64 + spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count) + if offset != -1 { + offset += spliceN } - if err := t.Block(inCh); err != nil { - return 0, nil, err + n += spliceN + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForBoth(t) + } + if err != nil { + break } } - if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { - if outCh == nil { - outCh = make(chan struct{}, 1) - outW, _ := waiter.NewChannelEntry(outCh) - outFile.EventRegister(&outW, eventMaskWrite) - defer outFile.EventUnregister(&outW) - continue // Need to refresh readiness. + } else { + // Read inFile to buffer, then write the contents to outFile. + buf := make([]byte, count) + for n < count { + var readN int64 + if offset != -1 { + readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) + offset += readN + } else { + readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } - if err := t.Block(outCh); err != nil { - return 0, nil, err + n += readN + + // Write all of the bytes that we read. This may need + // multiple write calls to complete. + wbuf := buf[:readN] + for len(wbuf) > 0 { + var writeN int64 + writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) + wbuf = wbuf[writeN:] + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForOut(t) + } + if err != nil { + // We didn't complete the write. Only report the bytes that were actually + // written, and rewind offsets as needed. + notWritten := int64(len(wbuf)) + n -= notWritten + if offset == -1 { + // We modified the offset of the input file itself during the read + // operation. Rewind it. + if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { + // Log the error but don't return it, since the write has already + // completed successfully. + log.Warningf("failed to roll back input file offset: %v", seekErr) + } + } else { + // The sendfile call was provided an offset parameter that should be + // adjusted to reflect the number of bytes sent. Rewind it. + offset -= notWritten + } + break + } + } + if err == syserror.ErrWouldBlock && !nonBlock { + err = dw.waitForBoth(t) + } + if err != nil { + break } } } + + if offsetAddr != 0 { + // Copy out the new offset. + offsetP := primitive.Uint64(offset) + if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { + return 0, nil, err + } + } + + if n != 0 { + inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) + + if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { + // If a partial write is completed, the error is dropped. Log it here. + log.Debugf("sendfile completed a partial write with error: %v", err) + err = nil + } + } + + // We can only pass a single file to handleIOError, so pick inFile arbitrarily. + // This is used only for debugging purposes. + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) +} + +// dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not +// thread-safe, and does not take a reference on the vfs.FileDescriptions. +// +// Users must call destroy() when finished. +type dualWaiter struct { + inFile *vfs.FileDescription + outFile *vfs.FileDescription + + inW waiter.Entry + inCh chan struct{} + outW waiter.Entry + outCh chan struct{} +} + +// waitForBoth waits for both dw.inFile and dw.outFile to be ready. +func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { + if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if dw.inCh == nil { + dw.inW, dw.inCh = waiter.NewChannelEntry(nil) + dw.inFile.EventRegister(&dw.inW, eventMaskRead) + // We might be ready now. Try again before blocking. + return nil + } + if err := t.Block(dw.inCh); err != nil { + return err + } + } + return dw.waitForOut(t) +} + +// waitForOut waits for dw.outfile to be read. +func (dw *dualWaiter) waitForOut(t *kernel.Task) error { + if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if dw.outCh == nil { + dw.outW, dw.outCh = waiter.NewChannelEntry(nil) + dw.outFile.EventRegister(&dw.outW, eventMaskWrite) + // We might be ready now. Try again before blocking. + return nil + } + if err := t.Block(dw.outCh); err != nil { + return err + } + } + return nil +} + +// destroy cleans up resources help by dw. No more calls to wait* can occur +// after destroy is called. +func (dw *dualWaiter) destroy() { + if dw.inCh != nil { + dw.inFile.EventUnregister(&dw.inW) + dw.inCh = nil + } + if dw.outCh != nil { + dw.outFile.EventUnregister(&dw.outW) + dw.outCh = nil + } + dw.inFile = nil + dw.outFile = nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index bb1d5cac4..0f5d5189c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return err } @@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, @@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return 0, nil, err } @@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. @@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 365250b0b..6e9b599e2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SyncFS(t) } @@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.Sync(t) } @@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes := args[2].Int64() flags := args[3].Uint() - if offset < 0 { - return 0, nil, syserror.EINVAL - } - if nbytes < 0 { + // Check for negative values and overflow. + if offset < 0 || offset+nbytes < 0 { return 0, nil, syserror.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { @@ -79,9 +77,39 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) - // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of - // [offset, offset+nbytes). - return 0, nil, file.Sync(t) + // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support + // is a full-file sync, i.e. fsync(2). As a result, there are severe + // limitations on how much we support sync_file_range: + // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even + // if the file size is changed. We do. + // - We always sync the entire file instead of [offset, offset+nbytes). + // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For + // correctness, we would have to perform a write-out every time WAIT_BEFORE + // was used, but this would be much more expensive than expected if there + // were no write-out operations in progress. + // - Whenever WAIT_AFTER is used, we sync the file. + // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will + // be synced anyway. If this flag is used without WAIT_AFTER, then it is + // safe (and less expensive) to do nothing, because the syscall will not + // wait for the write-out to complete--we only need to make sure that the + // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. + // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O + // errors or ENOSPC conditions and will return those to the caller, after + // clearing the EIO and ENOSPC flags in the address_space." We don't do + // this. + + if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + if err := file.Sync(t); err != nil { + return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) + } + } + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 5ac79bc09..250870c03 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } vfsObj := t.Kernel().VFS() - file, err := timerfd.New(vfsObj, clock, fileFlags) + file, err := timerfd.New(t, vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, }) @@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { @@ -87,7 +87,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } var newVal linux.Itimerspec - if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) @@ -97,7 +97,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, oldS := tfd.SetTime(newS) if oldValAddr != 0 { oldVal := ktime.ItimerspecFromSetting(tm, oldS) - if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + if _, err := oldVal.CopyOut(t, oldValAddr); err != nil { return 0, nil, err } } @@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { @@ -122,6 +122,6 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne tm, s := tfd.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) - _, err := t.CopyOut(curValAddr, &curVal) + _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index ef8358b8a..c50fd97eb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -44,7 +44,7 @@ func Override() { s.Table[23] = syscalls.Supported("select", Select) s.Table[32] = syscalls.Supported("dup", Dup) s.Table[33] = syscalls.Supported("dup2", Dup2) - delete(s.Table, 40) // sendfile + s.Table[40] = syscalls.Supported("sendfile", Sendfile) s.Table[41] = syscalls.Supported("socket", Socket) s.Table[42] = syscalls.Supported("connect", Connect) s.Table[43] = syscalls.Supported("accept", Accept) @@ -62,7 +62,7 @@ func Override() { s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) s.Table[59] = syscalls.Supported("execve", Execve) s.Table[72] = syscalls.Supported("fcntl", Fcntl) - delete(s.Table, 73) // flock + s.Table[73] = syscalls.Supported("flock", Flock) s.Table[74] = syscalls.Supported("fsync", Fsync) s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) s.Table[76] = syscalls.Supported("truncate", Truncate) @@ -90,29 +90,25 @@ func Override() { s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) s.Table[161] = syscalls.Supported("chroot", Chroot) s.Table[162] = syscalls.Supported("sync", Sync) - delete(s.Table, 165) // mount - delete(s.Table, 166) // umount2 - delete(s.Table, 187) // readahead - s.Table[188] = syscalls.Supported("setxattr", Setxattr) + s.Table[165] = syscalls.Supported("mount", Mount) + s.Table[166] = syscalls.Supported("umount2", Umount2) + s.Table[187] = syscalls.Supported("readahead", Readahead) + s.Table[188] = syscalls.Supported("setxattr", SetXattr) s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) s.Table[190] = syscalls.Supported("fsetxattr", Fsetxattr) - s.Table[191] = syscalls.Supported("getxattr", Getxattr) + s.Table[191] = syscalls.Supported("getxattr", GetXattr) s.Table[192] = syscalls.Supported("lgetxattr", Lgetxattr) s.Table[193] = syscalls.Supported("fgetxattr", Fgetxattr) - s.Table[194] = syscalls.Supported("listxattr", Listxattr) + s.Table[194] = syscalls.Supported("listxattr", ListXattr) s.Table[195] = syscalls.Supported("llistxattr", Llistxattr) s.Table[196] = syscalls.Supported("flistxattr", Flistxattr) - s.Table[197] = syscalls.Supported("removexattr", Removexattr) + s.Table[197] = syscalls.Supported("removexattr", RemoveXattr) s.Table[198] = syscalls.Supported("lremovexattr", Lremovexattr) s.Table[199] = syscalls.Supported("fremovexattr", Fremovexattr) - delete(s.Table, 206) // io_setup - delete(s.Table, 207) // io_destroy - delete(s.Table, 208) // io_getevents - delete(s.Table, 209) // io_submit - delete(s.Table, 210) // io_cancel + s.Table[209] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) s.Table[213] = syscalls.Supported("epoll_create", EpollCreate) s.Table[217] = syscalls.Supported("getdents64", Getdents64) - delete(s.Table, 221) // fdavise64 + s.Table[221] = syscalls.PartiallySupported("fadvise64", Fadvise64, "The syscall is 'supported', but ignores all provided advice.", nil) s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) @@ -142,7 +138,7 @@ func Override() { s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) - delete(s.Table, 285) // fallocate + s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) @@ -167,6 +163,112 @@ func Override() { // Override ARM64. s = linux.ARM64 + s.Table[2] = syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}) + s.Table[5] = syscalls.Supported("setxattr", SetXattr) + s.Table[6] = syscalls.Supported("lsetxattr", Lsetxattr) + s.Table[7] = syscalls.Supported("fsetxattr", Fsetxattr) + s.Table[8] = syscalls.Supported("getxattr", GetXattr) + s.Table[9] = syscalls.Supported("lgetxattr", Lgetxattr) + s.Table[10] = syscalls.Supported("fgetxattr", Fgetxattr) + s.Table[11] = syscalls.Supported("listxattr", ListXattr) + s.Table[12] = syscalls.Supported("llistxattr", Llistxattr) + s.Table[13] = syscalls.Supported("flistxattr", Flistxattr) + s.Table[14] = syscalls.Supported("removexattr", RemoveXattr) + s.Table[15] = syscalls.Supported("lremovexattr", Lremovexattr) + s.Table[16] = syscalls.Supported("fremovexattr", Fremovexattr) + s.Table[17] = syscalls.Supported("getcwd", Getcwd) + s.Table[19] = syscalls.Supported("eventfd2", Eventfd2) + s.Table[20] = syscalls.Supported("epoll_create1", EpollCreate1) + s.Table[21] = syscalls.Supported("epoll_ctl", EpollCtl) + s.Table[22] = syscalls.Supported("epoll_pwait", EpollPwait) + s.Table[23] = syscalls.Supported("dup", Dup) + s.Table[24] = syscalls.Supported("dup3", Dup3) + s.Table[25] = syscalls.Supported("fcntl", Fcntl) + s.Table[26] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) + s.Table[27] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[28] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[29] = syscalls.Supported("ioctl", Ioctl) + s.Table[32] = syscalls.Supported("flock", Flock) + s.Table[33] = syscalls.Supported("mknodat", Mknodat) + s.Table[34] = syscalls.Supported("mkdirat", Mkdirat) + s.Table[35] = syscalls.Supported("unlinkat", Unlinkat) + s.Table[36] = syscalls.Supported("symlinkat", Symlinkat) + s.Table[37] = syscalls.Supported("linkat", Linkat) + s.Table[38] = syscalls.Supported("renameat", Renameat) + s.Table[39] = syscalls.Supported("umount2", Umount2) + s.Table[40] = syscalls.Supported("mount", Mount) + s.Table[43] = syscalls.Supported("statfs", Statfs) + s.Table[44] = syscalls.Supported("fstatfs", Fstatfs) + s.Table[45] = syscalls.Supported("truncate", Truncate) + s.Table[46] = syscalls.Supported("ftruncate", Ftruncate) + s.Table[47] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) + s.Table[48] = syscalls.Supported("faccessat", Faccessat) + s.Table[49] = syscalls.Supported("chdir", Chdir) + s.Table[50] = syscalls.Supported("fchdir", Fchdir) + s.Table[51] = syscalls.Supported("chroot", Chroot) + s.Table[52] = syscalls.Supported("fchmod", Fchmod) + s.Table[53] = syscalls.Supported("fchmodat", Fchmodat) + s.Table[54] = syscalls.Supported("fchownat", Fchownat) + s.Table[55] = syscalls.Supported("fchown", Fchown) + s.Table[56] = syscalls.Supported("openat", Openat) + s.Table[57] = syscalls.Supported("close", Close) + s.Table[59] = syscalls.Supported("pipe2", Pipe2) + s.Table[61] = syscalls.Supported("getdents64", Getdents64) + s.Table[62] = syscalls.Supported("lseek", Lseek) s.Table[63] = syscalls.Supported("read", Read) + s.Table[64] = syscalls.Supported("write", Write) + s.Table[65] = syscalls.Supported("readv", Readv) + s.Table[66] = syscalls.Supported("writev", Writev) + s.Table[67] = syscalls.Supported("pread64", Pread64) + s.Table[68] = syscalls.Supported("pwrite64", Pwrite64) + s.Table[69] = syscalls.Supported("preadv", Preadv) + s.Table[70] = syscalls.Supported("pwritev", Pwritev) + s.Table[71] = syscalls.Supported("sendfile", Sendfile) + s.Table[72] = syscalls.Supported("pselect", Pselect) + s.Table[73] = syscalls.Supported("ppoll", Ppoll) + s.Table[74] = syscalls.Supported("signalfd4", Signalfd4) + s.Table[76] = syscalls.Supported("splice", Splice) + s.Table[77] = syscalls.Supported("tee", Tee) + s.Table[78] = syscalls.Supported("readlinkat", Readlinkat) + s.Table[79] = syscalls.Supported("newfstatat", Newfstatat) + s.Table[80] = syscalls.Supported("fstat", Fstat) + s.Table[81] = syscalls.Supported("sync", Sync) + s.Table[82] = syscalls.Supported("fsync", Fsync) + s.Table[83] = syscalls.Supported("fdatasync", Fdatasync) + s.Table[84] = syscalls.Supported("sync_file_range", SyncFileRange) + s.Table[85] = syscalls.Supported("timerfd_create", TimerfdCreate) + s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime) + s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime) + s.Table[88] = syscalls.Supported("utimensat", Utimensat) + s.Table[198] = syscalls.Supported("socket", Socket) + s.Table[199] = syscalls.Supported("socketpair", SocketPair) + s.Table[200] = syscalls.Supported("bind", Bind) + s.Table[201] = syscalls.Supported("listen", Listen) + s.Table[202] = syscalls.Supported("accept", Accept) + s.Table[203] = syscalls.Supported("connect", Connect) + s.Table[204] = syscalls.Supported("getsockname", GetSockName) + s.Table[205] = syscalls.Supported("getpeername", GetPeerName) + s.Table[206] = syscalls.Supported("sendto", SendTo) + s.Table[207] = syscalls.Supported("recvfrom", RecvFrom) + s.Table[208] = syscalls.Supported("setsockopt", SetSockOpt) + s.Table[209] = syscalls.Supported("getsockopt", GetSockOpt) + s.Table[210] = syscalls.Supported("shutdown", Shutdown) + s.Table[211] = syscalls.Supported("sendmsg", SendMsg) + s.Table[212] = syscalls.Supported("recvmsg", RecvMsg) + s.Table[213] = syscalls.Supported("readahead", Readahead) + s.Table[221] = syscalls.Supported("execve", Execve) + s.Table[222] = syscalls.Supported("mmap", Mmap) + s.Table[223] = syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil) + s.Table[242] = syscalls.Supported("accept4", Accept4) + s.Table[243] = syscalls.Supported("recvmmsg", RecvMMsg) + s.Table[267] = syscalls.Supported("syncfs", Syncfs) + s.Table[269] = syscalls.Supported("sendmmsg", SendMMsg) + s.Table[276] = syscalls.Supported("renameat2", Renameat2) + s.Table[279] = syscalls.Supported("memfd_create", MemfdCreate) + s.Table[281] = syscalls.Supported("execveat", Execveat) + s.Table[286] = syscalls.Supported("preadv2", Preadv2) + s.Table[287] = syscalls.Supported("pwritev2", Pwritev2) + s.Table[291] = syscalls.Supported("statx", Statx) + s.Init() } diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index af455d5c1..e05723ef9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -26,8 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// Listxattr implements Linux syscall listxattr(2). -func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// ListXattr implements Linux syscall listxattr(2). +func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return listxattr(t, args, followFinalSymlink) } @@ -49,9 +49,9 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) - names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) + names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { return 0, nil, err } @@ -72,9 +72,9 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) - names, err := file.Listxattr(t, uint64(size)) + names, err := file.ListXattr(t, uint64(size)) if err != nil { return 0, nil, err } @@ -85,8 +85,8 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return uintptr(n), nil, nil } -// Getxattr implements Linux syscall getxattr(2). -func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// GetXattr implements Linux syscall getxattr(2). +func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, followFinalSymlink) } @@ -109,14 +109,14 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } - value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{ + value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{ Name: name, Size: uint64(size), }) @@ -141,14 +141,14 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } - value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)}) + value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)}) if err != nil { return 0, nil, err } @@ -159,8 +159,8 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return uintptr(n), nil, nil } -// Setxattr implements Linux syscall setxattr(2). -func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// SetXattr implements Linux syscall setxattr(2). +func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, followFinalSymlink) } @@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -199,7 +199,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli return err } - return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), @@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -233,15 +233,15 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{ + return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } -// Removexattr implements Linux syscall removexattr(2). -func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// RemoveXattr implements Linux syscall removexattr(2). +func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, followFinalSymlink) } @@ -262,14 +262,14 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return err } - return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) + return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name) } // Fremovexattr implements Linux syscall fremovexattr(2). @@ -281,14 +281,14 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } - return 0, nil, file.Removexattr(t, name) + return 0, nil, file.RemoveXattr(t, name) } func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { diff --git a/pkg/sentry/time/muldiv_arm64.s b/pkg/sentry/time/muldiv_arm64.s index 5ad57a8a3..8afc62d53 100644 --- a/pkg/sentry/time/muldiv_arm64.s +++ b/pkg/sentry/time/muldiv_arm64.s @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "funcdata.h" #include "textflag.h" // Documentation is available in parameters.go. // // func muldiv64(value, multiplier, divisor uint64) (uint64, bool) TEXT ·muldiv64(SB),NOSPLIT,$40-33 + GO_ARGS + NO_LOCAL_POINTERS MOVD value+0(FP), R0 MOVD multiplier+8(FP), R1 MOVD divisor+16(FP), R2 diff --git a/pkg/sentry/time/parameters.go b/pkg/sentry/time/parameters.go index 65868cb26..cd1b95117 100644 --- a/pkg/sentry/time/parameters.go +++ b/pkg/sentry/time/parameters.go @@ -228,11 +228,15 @@ func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Par // // The log level is determined by the error severity. func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) { - fn := log.Debugf - if int64(errorNS.Magnitude()) > time.Millisecond.Nanoseconds() { + magNS := int64(errorNS.Magnitude()) + if magNS <= 10*time.Microsecond.Nanoseconds() { + // Don't log small errors. + return + } + fn := log.Infof + if magNS > time.Millisecond.Nanoseconds() { + // Upgrade large errors to warning. fn = log.Warningf - } else if int64(errorNS.Magnitude()) > 10*time.Microsecond.Nanoseconds() { - fn = log.Infof } fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency) diff --git a/pkg/sentry/time/parameters_test.go b/pkg/sentry/time/parameters_test.go index e1b9084ac..0ce1257f6 100644 --- a/pkg/sentry/time/parameters_test.go +++ b/pkg/sentry/time/parameters_test.go @@ -484,3 +484,18 @@ func TestMulDivOverflow(t *testing.T) { }) } } + +func BenchmarkMuldiv64(b *testing.B) { + var v uint64 = math.MaxUint64 + for i := uint64(1); i <= 1000000; i++ { + mult := uint64(1000000000) + div := i * mult + res, ok := muldiv64(v, mult, div) + if !ok { + b.Errorf("Result of %v * %v / %v ok got false want true", v, mult, div) + } + if want := v / i; res != want { + b.Errorf("Bad result of %v * %v / %v: got %v, want %v", v, mult, div, res, want) + } + } +} diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index ab1d140d2..5ed6726ab 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 { } if memSize < used { memSize = used - // Bump totalSize to the next largest power of 2, if one exists, so + // Bump memSize to the next largest power of 2, if one exists, so // that MemFree isn't 0. if msb := bits.MostSignificantOne64(memSize); msb < 63 { memSize = uint64(1) << (uint(msb) + 1) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 774cc66cc..440c9307c 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -27,6 +27,39 @@ go_template_instance( }, ) +go_template_instance( + name = "file_description_refs", + out = "file_description_refs.go", + package = "vfs", + prefix = "FileDescription", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "FileDescription", + }, +) + +go_template_instance( + name = "mount_namespace_refs", + out = "mount_namespace_refs.go", + package = "vfs", + prefix = "MountNamespace", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "MountNamespace", + }, +) + +go_template_instance( + name = "filesystem_refs", + out = "filesystem_refs.go", + package = "vfs", + prefix = "Filesystem", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "Filesystem", + }, +) + go_library( name = "vfs", srcs = [ @@ -40,28 +73,34 @@ go_library( "event_list.go", "file_description.go", "file_description_impl_util.go", + "file_description_refs.go", "filesystem.go", "filesystem_impl_util.go", + "filesystem_refs.go", "filesystem_type.go", "inotify.go", + "lock.go", "mount.go", + "mount_namespace_refs.go", "mount_unsafe.go", "options.go", "pathname.go", "permissions.go", "resolving_path.go", + "save_restore.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", + "//pkg/refs", + "//pkg/refsvfs2", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 9aa133bcb..5aad31b78 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -39,8 +39,8 @@ Mount references are held by: - Mount: Each referenced Mount holds a reference on its parent, which is the mount containing its mount point. -- VirtualFilesystem: A reference is held on each Mount that has not been - umounted. +- VirtualFilesystem: A reference is held on each Mount that has been connected + to a mount point, but not yet umounted. MountNamespace and FileDescription references are held by users of VFS. The expectation is that each `kernel.Task` holds a reference on its corresponding @@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's - binder, which is similarly far too incomplete to use. - - whitelistfs, which we are already actively attempting to remove. - - Save/restore. For instance, it is unclear if the current implementation of the `state` package supports the inheritance pattern described above. @@ -186,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's - File locking - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which have poor performance and will not be used - by VFS2. The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index b7c6b60b8..7ad0eaf86 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -52,6 +52,8 @@ const ( ) // anonFilesystemType implements FilesystemType. +// +// +stateify savable type anonFilesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. @@ -59,22 +61,28 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au panic("cannot instaniate an anon filesystem") } -// Name implemenents FilesystemType.Name. +// Name implements FilesystemType.Name. func (anonFilesystemType) Name() string { return "none" } +// Release implemenents FilesystemType.Release. +func (anonFilesystemType) Release(ctx context.Context) {} + // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl // methods that would require an anonDentry to be a directory return ENOTDIR. +// +// +stateify savable type anonFilesystem struct { vfsfs Filesystem devMinor uint32 } +// +stateify savable type anonDentry struct { vfsd Dentry @@ -82,7 +90,7 @@ type anonDentry struct { } // Release implements FilesystemImpl.Release. -func (fs *anonFilesystem) Release() { +func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. @@ -245,32 +253,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements FilesystemImpl.ListxattrAt. -func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements FilesystemImpl.ListXattrAt. +func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { return nil, syserror.ENOTDIR } return nil, nil } -// GetxattrAt implements FilesystemImpl.GetxattrAt. -func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) { +// GetXattrAt implements FilesystemImpl.GetXattrAt. +func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { return "", syserror.ENOTDIR } return "", syserror.ENOTSUP } -// SetxattrAt implements FilesystemImpl.SetxattrAt. -func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { +// SetXattrAt implements FilesystemImpl.SetXattrAt. +func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { return syserror.ENOTDIR } return syserror.EPERM } -// RemovexattrAt implements FilesystemImpl.RemovexattrAt. -func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { +// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. +func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { return syserror.ENOTDIR } @@ -294,18 +302,21 @@ func (d *anonDentry) TryIncRef() bool { } // DecRef implements DentryImpl.DecRef. -func (d *anonDentry) DecRef() { +func (d *anonDentry) DecRef(ctx context.Context) { // no-op } // InotifyWithParent implements DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {} +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. +func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *anonDentry) Watches() *Watches { return nil } + +// OnZeroWatches implements Dentry.OnZeroWatches. +func (d *anonDentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index c9e724fef..97018651f 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { return nil } +type mountNamespaceContext struct { + context.Context + mntns *MountNamespace +} + +// Value implements Context.Value. +func (mc mountNamespaceContext) Value(key interface{}) interface{} { + switch key { + case CtxMountNamespace: + mc.mntns.IncRef() + return mc.mntns + default: + return mc.Context.Value(key) + } +} + +// WithMountNamespace returns a copy of ctx with the given MountNamespace. +func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context { + return &mountNamespaceContext{ + Context: ctx, + mntns: mntns, + } +} + // RootFromContext returns the VFS root used by ctx. It takes a reference on // the returned VirtualDentry. If ctx does not have a specific VFS root, // RootFromContext returns a zero-value VirtualDentry. diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 24af13eb1..320ab7ce1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -17,6 +17,7 @@ package vfs import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -88,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl { // DentryImpl contains implementation details for a Dentry. Implementations of // DentryImpl should contain their associated Dentry by value as their first // field. +// +// +stateify savable type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. @@ -102,7 +105,7 @@ type DentryImpl interface { TryIncRef() bool // DecRef decrements the Dentry's reference count. - DecRef() + DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. The parent's watches are notified first, followed @@ -113,12 +116,29 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events uint32, cookie uint32, et EventType) + InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file // share the same watches. + // + // Watches may return nil if the dentry belongs to a FilesystemImpl that + // does not support inotify. If an implementation returns a non-nil watch + // set, it must always return a non-nil watch set. Likewise, if an + // implementation returns a nil watch set, it must always return a nil watch + // set. + // + // The caller does not need to hold a reference on the dentry. Watches() *Watches + + // OnZeroWatches is called whenever the number of watches on a dentry drops + // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage + // dentry lifetime. + // + // The caller does not need to hold a reference on the dentry. OnZeroWatches + // may acquire inotify locks, so to prevent deadlock, no inotify locks should + // be held by the caller. + OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. @@ -133,8 +153,8 @@ func (d *Dentry) TryIncRef() bool { } // DecRef decrements d's reference count. -func (d *Dentry) DecRef() { - d.impl.DecRef() +func (d *Dentry) DecRef(ctx context.Context) { + d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning @@ -149,17 +169,26 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } -// InotifyWithParent notifies all watches on the inodes for this dentry and +// InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) { - d.impl.InotifyWithParent(events, cookie, et) +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. +// +// Watches will return nil if d belongs to a FilesystemImpl that does not +// support inotify. func (d *Dentry) Watches() *Watches { return d.impl.Watches() } +// OnZeroWatches performs cleanup tasks whenever the number of watches on a +// dentry drops to zero. +func (d *Dentry) OnZeroWatches(ctx context.Context) { + d.impl.OnZeroWatches(ctx) +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. @@ -188,11 +217,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { +func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -200,12 +229,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). -func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { +func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -215,8 +244,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // -// Preconditions: If to is not nil, it must be a child Dentry from the same -// Filesystem. from != to. +// Preconditions: +// * If to is not nil, it must be a child Dentry from the same Filesystem. +// * from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { @@ -252,13 +282,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDeadMountpoint(to) + vfs.forgetDeadMountpoint(ctx, to) } } } @@ -277,7 +307,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). -func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount @@ -290,9 +320,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 1e9dffc8f..dde2ad79b 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -22,6 +22,8 @@ import ( ) // DeviceKind indicates whether a device is a block or character device. +// +// +stateify savable type DeviceKind uint32 const ( @@ -44,6 +46,7 @@ func (kind DeviceKind) String() string { } } +// +stateify savable type devTuple struct { kind DeviceKind major uint32 diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8297f964b..a98aac52b 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -27,21 +27,24 @@ import ( var epollCycleMu sync.Mutex // EpollInstance represents an epoll instance, as described by epoll(7). +// +// +stateify savable type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // q holds waiters on this EpollInstance. q waiter.Queue // interest is the set of file descriptors that are registered with the // EpollInstance for monitoring. interest is protected by interestMu. - interestMu sync.Mutex + interestMu sync.Mutex `state:"nosave"` interest map[epollInterestKey]*epollInterest // mu protects fields in registered epollInterests. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // ready is the set of file descriptors that may be "ready" for I/O. Note // that this must be an ordered list, not a map: "If more than maxevents @@ -54,6 +57,7 @@ type EpollInstance struct { ready epollInterestList } +// +stateify savable type epollInterestKey struct { // file is the registered FileDescription. No reference is held on file; // instead, when the last reference is dropped, FileDescription.DecRef() @@ -66,9 +70,11 @@ type epollInterestKey struct { } // epollInterest represents an EpollInstance's interest in a file descriptor. +// +// +stateify savable type epollInterest struct { // epoll is the owning EpollInstance. epoll is immutable. - epoll *EpollInstance + epoll *EpollInstance `state:"wait"` // key is the file to which this epollInterest applies. key is immutable. key epollInterestKey @@ -92,9 +98,9 @@ type epollInterest struct { // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { +func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") - defer vd.DecRef() + defer vd.DecRef(ctx) ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } @@ -109,7 +115,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { } // Release implements FileDescriptionImpl.Release. -func (ep *EpollInstance) Release() { +func (ep *EpollInstance) Release(ctx context.Context) { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() @@ -185,7 +191,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin } // Register interest in file. - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP epi := &epollInterest{ epoll: ep, key: key, @@ -256,7 +262,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event } // Update epi for the next call to ep.ReadEvents(). - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP ep.mu.Lock() epi.mask = mask epi.userData = event.Data @@ -330,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) { ep.mu.Unlock() } -// ReadEvents reads up to len(events) ready events into events and returns the -// number of events read. -// -// Preconditions: len(events) != 0. -func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { +// ReadEvents appends up to maxReady events to events and returns the updated +// slice of events. +func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { i := 0 // Hot path: avoid defer. ep.mu.Lock() @@ -367,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { requeue.PushBack(epi) } // Report ievents. - events[i] = linux.EpollEvent{ + events = append(events, linux.EpollEvent{ Events: ievents.ToLinux(), Data: epi.userData, - } + }) i++ - if i == len(events) { + if i == maxEvents { break } } ep.ready.PushBackList(&requeue) ep.mu.Unlock() - return i + return events } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index bb294563d..546e445aa 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -37,19 +37,28 @@ import ( // FileDescription methods require that a reference is held. // // FileDescription is analogous to Linux's struct file. +// +// +stateify savable type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FileDescriptionRefs + + // flagsMu protects statusFlags and asyncHandler below. + flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly - // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic - // memory operations. + // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic + // memory operations when it does not need to be synchronized with an + // access to asyncHandler. statusFlags uint32 + // asyncHandler handles O_ASYNC signal generation. It is set with the + // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must + // also be set by fcntl(2). + asyncHandler FileAsync + // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. - epollMu sync.Mutex + epollMu sync.Mutex `state:"nosave"` epolls map[*epollInterest]struct{} // vd is the filesystem location at which this FileDescription was opened. @@ -73,15 +82,18 @@ type FileDescription struct { // writable is analogous to Linux's FMODE_WRITE. writable bool + usedLockBSD uint32 + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl } // FileDescriptionOptions contains options to FileDescription.Init(). +// +// +stateify savable type FileDescriptionOptions struct { - // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is - // usually only the case if O_DIRECT would actually have an effect. + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. @@ -93,7 +105,7 @@ type FileDescriptionOptions struct { // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, - // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling // the corresponding FilesystemImpl methods instead of the corresponding // FileDescriptionImpl methods. // @@ -106,6 +118,10 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } +// FileCreationFlags are the set of flags passed to FileDescription.Init() but +// omitted from FileDescription.StatusFlags(). +const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC + // Init must be called before first use of fd. If it succeeds, it takes // references on mnt and d. flags is the initial file description flags, which // is usually the full set of flags passed to open(2). @@ -117,11 +133,11 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou } } - fd.refs = 1 + fd.EnableLeakCheck() // Remove "file creation flags" to mirror the behavior from file.f_flags in - // fs/open.c:do_dentry_open - fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC) + // fs/open.c:do_dentry_open. + fd.statusFlags = flags &^ FileCreationFlags fd.vd = VirtualDentry{ mount: mnt, dentry: d, @@ -135,30 +151,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou return nil } -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// TryIncRef increments fd's reference count and returns true. If fd's -// reference count is already zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fd. -func (fd *FileDescription) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fd.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fd's reference count. -func (fd *FileDescription) DecRef() { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { +func (fd *FileDescription) DecRef(ctx context.Context) { + fd.FileDescriptionRefs.DecRef(func() { // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -175,21 +170,25 @@ func (fd *FileDescription) DecRef() { } ep.interestMu.Unlock() } + + // If BSD locks were used, release any lock that it may have acquired. + if atomic.LoadUint32(&fd.usedLockBSD) != 0 { + fd.impl.UnlockBSD(context.Background(), fd) + } + // Release implementation resources. - fd.impl.Release() + fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } - fd.vd.DecRef() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// Refs returns the current number of references. The returned count -// is inherently racy and is unsafe to use without external synchronization. -func (fd *FileDescription) Refs() int64 { - return atomic.LoadInt64(&fd.refs) + fd.vd.DecRef(ctx) + fd.flagsMu.Lock() + if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Unregister(fd) + } + fd.asyncHandler = nil + fd.flagsMu.Unlock() + }) } // Mount returns the mount on which fd was opened. It does not take a reference @@ -262,9 +261,20 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { return syserror.EINVAL } - // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? + // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK + fd.flagsMu.Lock() + if fd.asyncHandler != nil { + // Use fd.statusFlags instead of oldFlags, which may have become outdated, + // to avoid double registering/unregistering. + if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { + fd.asyncHandler.Unregister(fd) + } + } atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + fd.flagsMu.Unlock() return nil } @@ -297,7 +307,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero // references. - Release() + Release(ctx context.Context) // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not @@ -316,6 +326,15 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) + // Allocate grows the file to offset + length bytes. + // Only mode == 0 is supported currently. + // + // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on + // other files where it is not supported. + // + // Preconditions: The FileDescription was opened for writing. + Allocate(ctx context.Context, mode, offset, length uint64) error + // waiter.Waitable methods may be used to poll for I/O events. waiter.Waitable @@ -327,8 +346,9 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for reading. - // FileDescriptionOptions.DenyPRead == false. + // Preconditions: + // * The FileDescription was opened for reading. + // * FileDescriptionOptions.DenyPRead == false. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -359,8 +379,9 @@ type FileDescriptionImpl interface { // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for writing. - // FileDescriptionOptions.DenyPWrite == false. + // Preconditions: + // * The FileDescription was opened for writing. + // * FileDescriptionOptions.DenyPWrite == false. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -405,42 +426,36 @@ type FileDescriptionImpl interface { // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // Listxattr returns all extended attribute names for the file. - Listxattr(ctx context.Context, size uint64) ([]string, error) + // ListXattr returns all extended attribute names for the file. + ListXattr(ctx context.Context, size uint64) ([]string, error) - // Getxattr returns the value associated with the given extended attribute + // GetXattr returns the value associated with the given extended attribute // for the file. - Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) + GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) - // Setxattr changes the value associated with the given extended attribute + // SetXattr changes the value associated with the given extended attribute // for the file. - Setxattr(ctx context.Context, opts SetxattrOptions) error + SetXattr(ctx context.Context, opts SetXattrOptions) error - // Removexattr removes the given extended attribute from the file. - Removexattr(ctx context.Context, name string) error + // RemoveXattr removes the given extended attribute from the file. + RemoveXattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error - // LockBSD releases a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking + // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error } // Dirent holds the information contained in struct linux_dirent64. +// +// +stateify savable type Dirent struct { // Name is the filename. Name string @@ -467,6 +482,15 @@ type IterDirentsCallback interface { Handle(dirent Dirent) error } +// IterDirentsCallbackFunc implements IterDirentsCallback for a function with +// the semantics of IterDirentsCallback.Handle. +type IterDirentsCallbackFunc func(dirent Dirent) error + +// Handle implements IterDirentsCallback.Handle. +func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error { + return f(dirent) +} + // OnClose is called when a file descriptor representing the FileDescription is // closed. Returning a non-nil error should not prevent the file descriptor // from being closed. @@ -483,7 +507,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St Start: fd.vd, }) stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return stat, err } return fd.impl.Stat(ctx, opts) @@ -498,7 +522,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.SetStat(ctx, opts) @@ -514,23 +538,37 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { Start: fd.vd, }) statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return statfs, err } return fd.impl.StatFS(ctx) } -// Readiness returns fd's I/O readiness. +// Allocate grows file represented by FileDescription to offset + length bytes. +func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !fd.IsWritable() { + return syserror.EBADF + } + return fd.impl.Allocate(ctx, mode, offset, length) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// It returns fd's I/O readiness. func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.impl.Readiness(mask) } -// EventRegister registers e for I/O readiness events in mask. +// EventRegister implements waiter.Waitable.EventRegister. +// +// It registers e for I/O readiness events in mask. func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.impl.EventRegister(e, mask) } -// EventUnregister unregisters e for I/O readiness events. +// EventUnregister implements waiter.Waitable.EventUnregister. +// +// It unregisters e for I/O readiness events. func (fd *FileDescription) EventUnregister(e *waiter.Entry) { fd.impl.EventUnregister(e) } @@ -605,25 +643,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. return fd.impl.Ioctl(ctx, uio, args) } -// Listxattr returns all extended attribute names for the file represented by +// ListXattr returns all extended attribute names for the file represented by // fd. // // If the size of the list (including a NUL terminating byte after every entry) // would exceed size, ERANGE may be returned. Note that implementations // are free to ignore size entirely and return without error). In all cases, // if size is 0, the list should be returned without error, regardless of size. -func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { +func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) - vfsObj.putResolvingPath(rp) + names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size) + vfsObj.putResolvingPath(ctx, rp) return names, err } - names, err := fd.impl.Listxattr(ctx, size) + names, err := fd.impl.ListXattr(ctx, size) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security @@ -634,57 +672,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string return names, err } -// Getxattr returns the value associated with the given extended attribute for +// GetXattr returns the value associated with the given extended attribute for // the file represented by fd. // // If the size of the return value exceeds opts.Size, ERANGE may be returned // (note that implementations are free to ignore opts.Size entirely and return // without error). In all cases, if opts.Size is 0, the value should be // returned without error, regardless of size. -func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) { +func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts) + vfsObj.putResolvingPath(ctx, rp) return val, err } - return fd.impl.Getxattr(ctx, *opts) + return fd.impl.GetXattr(ctx, *opts) } -// Setxattr changes the value associated with the given extended attribute for +// SetXattr changes the value associated with the given extended attribute for // the file represented by fd. -func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error { +func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts) + vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Setxattr(ctx, *opts) + return fd.impl.SetXattr(ctx, *opts) } -// Removexattr removes the given extended attribute from the file represented +// RemoveXattr removes the given extended attribute from the file represented // by fd. -func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { +func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) - vfsObj.putResolvingPath(rp) + err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name) + vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Removexattr(ctx, name) + return fd.impl.RemoveXattr(ctx, name) } // SyncFS instructs the filesystem containing fd to execute the semantics of @@ -698,7 +736,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { vfsroot := RootFromContext(ctx) s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) if vfsroot.Ok() { - vfsroot.DecRef() + vfsroot.DecRef(ctx) } return s } @@ -736,3 +774,95 @@ func (fd *FileDescription) InodeID() uint64 { func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } + +// LockBSD tries to acquire a BSD-style advisory file lock. +func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { + atomic.StoreUint32(&fd.usedLockBSD, 1) + return fd.impl.LockBSD(ctx, fd, lockType, blocker) +} + +// UnlockBSD releases a BSD-style advisory file lock. +func (fd *FileDescription) UnlockBSD(ctx context.Context) error { + return fd.impl.UnlockBSD(ctx, fd) +} + +// LockPOSIX locks a POSIX-style file range lock. +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +} + +// UnlockPOSIX unlocks a POSIX-style file range lock. +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { + return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +} + +// A FileAsync sends signals to its owner when w is ready for IO. This is only +// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this +// interface to avoid circular dependencies. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// AsyncHandler returns the FileAsync for fd. +func (fd *FileDescription) AsyncHandler() FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + return fd.asyncHandler +} + +// SetAsyncHandler sets fd.asyncHandler if it has not been set before and +// returns it. +func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + if fd.asyncHandler == nil { + fd.asyncHandler = newHandler() + if fd.statusFlags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } + } + return fd.asyncHandler +} + +// FileReadWriteSeeker is a helper struct to pass a FileDescription as +// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. +type FileReadWriteSeeker struct { + FD *FileDescription + Ctx context.Context + ROpts ReadOptions + WOpts WriteOptions +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) + return int(n), err +} + +// Read implements io.ReadWriteSeeker.Read. +func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.Read(f.Ctx, dst, f.ROpts) + return int(n), err +} + +// Seek implements io.ReadWriteSeeker.Seek. +func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { + return f.FD.Seek(f.Ctx, offset, int32(whence)) +} + +// WriteAt implements io.WriterAt.WriteAt. +func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) + return int(n), err +} + +// Write implements io.ReadWriteSeeker.Write. +func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { + buf := usermem.BytesIOSequence(p) + n, err := f.FD.Write(f.Ctx, buf, f.WOpts) + return int(n), err +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index f4c111926..48ca9de44 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -42,6 +42,8 @@ import ( // FileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl // methods with default behavior analogous to Linux's. +// +// +stateify savable type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to @@ -56,6 +58,16 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err return linux.Statfs{}, syserror.ENOSYS } +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on an invalid type of file in Linux. +// +// Note that directories can rely on this implementation even though they +// should technically return EISDIR. Allocate should never be called for a +// directory, because it requires a writable fd. +func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { @@ -128,56 +140,43 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } -// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. -func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) { - // This isn't exactly accurate; see FileDescription.Listxattr. +func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { + // This isn't exactly accurate; see FileDescription.ListXattr. return nil, syserror.ENOTSUP } -// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) { +func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", syserror.ENOTSUP } -// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { +func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { return syserror.ENOTSUP } -// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { +func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { return syserror.ENOTSUP } -// LockBSD implements FileDescriptionImpl.LockBSD. -func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockBSD implements FileDescriptionImpl.UnlockBSD. -func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - return syserror.EBADF -} - -// LockPOSIX implements FileDescriptionImpl.LockPOSIX. -func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. -func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - return syserror.EBADF -} - // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. +// +// +stateify savable type DirectoryFileDescriptionDefaultImpl struct{} +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR @@ -201,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme // DentryMetadataFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is // true to obtain implementations of Stat and SetStat that panic. +// +// +stateify savable type DentryMetadataFileDescriptionImpl struct{} // Stat implements FileDescriptionImpl.Stat. @@ -215,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. +// +// +stateify savable type DynamicBytesSource interface { // Generate writes the file's contents to buf. Generate(ctx context.Context, buf *bytes.Buffer) error } // StaticData implements DynamicBytesSource over a static string. +// +// +stateify savable type StaticData struct { Data string } @@ -247,14 +252,24 @@ type WritableDynamicBytesSource interface { // // DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first // use. +// +// +stateify savable type DynamicBytesFileDescriptionImpl struct { data DynamicBytesSource // immutable - mu sync.Mutex // protects the following fields - buf bytes.Buffer + mu sync.Mutex `state:"nosave"` // protects the following fields + buf bytes.Buffer `state:".([]byte)"` off int64 lastRead int64 // offset at which the last Read, PRead, or Seek ended } +func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte { + return fd.buf.Bytes() +} + +func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) { + fd.buf.Write(p) +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data @@ -347,7 +362,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { - return 0, syserror.EINVAL + return 0, syserror.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { @@ -384,3 +399,58 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M fd.IncRef() return nil } + +// LockFD may be used by most implementations of FileDescriptionImpl.Lock* +// functions. Caller must call Init(). +// +// +stateify savable +type LockFD struct { + locks *FileLocks +} + +// Init initializes fd with FileLocks to use. +func (fd *LockFD) Init(locks *FileLocks) { + fd.locks = locks +} + +// Locks returns the locks associated with this file. +func (fd *LockFD) Locks() *FileLocks { + return fd.locks +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + fd.locks.UnlockBSD(uid) + return nil +} + +// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning ENOLCK. +// +// +stateify savable +type NoLockFD struct{} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.ENOLCK +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3a75d4d62..1cd607c0a 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -33,6 +33,7 @@ import ( type fileDescription struct { vfsfd FileDescription FileDescriptionDefaultImpl + NoLockFD } // genCount contains the number of times its DynamicBytesSource.Generate() @@ -79,9 +80,9 @@ type testFD struct { data DynamicBytesSource } -func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { +func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(ctx) var fd testFD fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) @@ -89,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS } // Release implements FileDescriptionImpl.Release. -func (fd *testFD) Release() { +func (fd *testFD) Release(context.Context) { } // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. @@ -108,11 +109,11 @@ func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{}) + defer fd.DecRef(ctx) // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) @@ -154,11 +155,11 @@ func TestGenCountFD(t *testing.T) { } // Write and PWrite fails. - if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } - if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } } @@ -166,11 +167,11 @@ func TestWritable(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef(ctx) buf := make([]byte, 10) ioseq := usermem.BytesIOSequence(buf) diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 1edd584c9..c93d94634 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -15,8 +15,6 @@ package vfs import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" @@ -34,9 +32,7 @@ import ( // // +stateify savable type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. @@ -52,7 +48,7 @@ type Filesystem struct { // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { - fs.refs = 1 + fs.EnableLeakCheck() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl @@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -// IncRef increments fs' reference count. -func (fs *Filesystem) IncRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.IncRef() called without holding a reference") - } -} - -// TryIncRef increments fs' reference count and returns true. If fs' reference -// count is zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fs. -func (fs *Filesystem) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fs.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fs' reference count. -func (fs *Filesystem) DecRef() { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { +func (fs *Filesystem) DecRef(ctx context.Context) { + fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() - fs.impl.Release() - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } + fs.impl.Release(ctx) + }) } // FilesystemImpl contains implementation details for a Filesystem. @@ -149,7 +120,7 @@ func (fs *Filesystem) DecRef() { type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. - Release() + Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). @@ -212,8 +183,9 @@ type FilesystemImpl interface { // ENOENT. Equivalently, if vd represents a file with a link count of 0 not // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If LinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -231,8 +203,9 @@ type FilesystemImpl interface { // - If the directory in which the new directory would be created has been // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MkdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -253,8 +226,9 @@ type FilesystemImpl interface { // - If the directory in which the file would be created has been removed // by RmdirAt or RenameAt, MknodAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MknodAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -345,11 +319,12 @@ type FilesystemImpl interface { // - If renaming would replace a non-empty directory, RenameAt returns // ENOTEMPTY. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a - // previous call to - // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is - // not "." or "..". + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). + // * oldParentVD.Dentry() was obtained from a previous call to + // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). + // * oldName is not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -372,8 +347,9 @@ type FilesystemImpl interface { // - If the file at rp exists but is not a directory, RmdirAt returns // ENOTDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If RmdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -410,8 +386,9 @@ type FilesystemImpl interface { // - If the directory in which the symbolic link would be created has been // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If SymlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -431,33 +408,34 @@ type FilesystemImpl interface { // // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If UnlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // ListxattrAt returns all extended attribute names for the file at rp. + // ListXattrAt returns all extended attribute names for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // ListxattrAt returns ENOTSUP. + // ListXattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned. Note that // implementations are free to ignore size entirely and return without // error). In all cases, if size is 0, the list should be returned without // error, regardless of size. - ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) + ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) - // GetxattrAt returns the value associated with the given extended + // GetXattrAt returns the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, GetxattrAt + // - If extended attributes are not supported by the filesystem, GetXattrAt // returns ENOTSUP. // // - If an extended attribute named opts.Name does not exist, ENODATA is @@ -467,30 +445,30 @@ type FilesystemImpl interface { // returned (note that implementations are free to ignore opts.Size entirely // and return without error). In all cases, if opts.Size is 0, the value // should be returned without error, regardless of size. - GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) + GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) - // SetxattrAt changes the value associated with the given extended + // SetXattrAt changes the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, SetxattrAt + // - If extended attributes are not supported by the filesystem, SetXattrAt // returns ENOTSUP. // // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists, // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, // ENODATA is returned. - SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error - // RemovexattrAt removes the given extended attribute from the file at rp. + // RemoveXattrAt removes the given extended attribute from the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // RemovexattrAt returns ENOTSUP. + // RemoveXattrAt returns ENOTSUP. // // - If name does not exist, ENODATA is returned. - RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // @@ -524,12 +502,12 @@ type FilesystemImpl interface { // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - - // TODO(gvisor.dev/issue/1479): inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +// +// +stateify savable type PrependPathAtVFSRootError struct{} // Error implements error.Error. @@ -540,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string { // PrependPathAtNonMountRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter an independent ancestor // Dentry that is not the Mount root. +// +// +stateify savable type PrependPathAtNonMountRootError struct{} // Error implements error.Error. @@ -550,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string { // PrependPathSyntheticError is returned by implementations of // FilesystemImpl.PrependPath() for which prepended names do not represent real // paths. +// +// +stateify savable type PrependPathSyntheticError struct{} // Error implements error.Error. diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..2620cf975 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,9 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericStatFS returns a statfs struct filled with the common fields for a +// general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs(). +func GenericStatFS(fsMagic uint64) linux.Statfs { + return linux.Statfs{ + Type: fsMagic, + BlockSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + } +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index f2298f7f6..9d54cc4ed 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -33,6 +33,9 @@ type FilesystemType interface { // Name returns the name of this FilesystemType. Name() string + + // Release releases all resources held by this FilesystemType. + Release(ctx context.Context) } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. @@ -55,10 +58,13 @@ type registeredFilesystemType struct { // RegisterFilesystemTypeOptions contains options to // VirtualFilesystem.RegisterFilesystem(). +// +// +stateify savable type RegisterFilesystemTypeOptions struct { - // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() - // for which MountOptions.InternalMount == false to use this filesystem - // type. + // AllowUserMount determines whether users are allowed to mount a file system + // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls + // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false + // to use this filesystem type. AllowUserMount bool // If AllowUserList is true, make this filesystem type visible in diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md new file mode 100644 index 000000000..833db213f --- /dev/null +++ b/pkg/sentry/vfs/g3doc/inotify.md @@ -0,0 +1,210 @@ +# Inotify + +Inotify is a mechanism for monitoring filesystem events in Linux--see +inotify(7). An inotify instance can be used to monitor files and directories for +modifications, creation/deletion, etc. The inotify API consists of system calls +that create inotify instances (inotify_init/inotify_init1) and add/remove +watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are +generated from various places in the sentry, including the syscall layer, the +vfs layer, the process fd table, and within each filesystem implementation. This +document outlines the implementation details of inotify in VFS2. + +## Inotify Objects + +Inotify data structures are implemented in the vfs package. + +### vfs.Inotify + +Inotify instances are represented by vfs.Inotify objects, which implement +vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a +pseudo-filesystem (anonfs). Each inotify instance receives events from a set of +vfs.Watch objects, which can be modified with inotify_add_watch(2) and +inotify_rm_watch(2). An application can retrieve events by reading the inotify +fd. + +### vfs.Watches + +The set of all watches held on a single file (i.e., the watch target) is stored +in vfs.Watches. Each watch will belong to a different inotify instance (an +instance can only have one watch on any watch target). The watches are stored in +a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions +to a single file will all share the same vfs.Watches (with the exception of the +gofer filesystem, described in a later section). Activity on the target causes +its vfs.Watches to generate notifications on its watches’ inotify instances. + +### vfs.Watch + +A single watch, owned by one inotify instance and applied to one watch target. +Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch, +which leads to some complicated locking behavior (see Lock Ordering). Whenever a +watch is notified of an event on its target, it will queue events to its inotify +instance for delivery to the user. + +### vfs.Event + +vfs.Event is a simple struct encapsulating all the fields for an inotify event. +It is generated by vfs.Watches and forwarded to the watches' owners. It is +serialized to the user during read(2) syscalls on the associated fs.Inotify's +fd. + +## Lock Ordering + +There are three locks related to the inotify implementation: + +Inotify.mu: the inotify instance lock. Inotify.evMu: the inotify event queue +lock. Watches.mu: the watch set lock, used to protect the collection of watches +on a target. + +The correct lock ordering for inotify code is: + +Inotify.mu -> Watches.mu -> Inotify.evMu. + +Note that we use a distinct lock to protect the inotify event queue. If we +simply used Inotify.mu, we could simultaneously have locks being acquired in the +order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would +cause deadlocks. For instance, adding a watch to an inotify instance would +require locking Inotify.mu, and then adding the same watch to the target would +cause Watches.mu to be held. At the same time, generating an event on the target +would require Watches.mu to be held before iterating through each watch, and +then notifying the owner of each watch would cause Inotify.mu to be held. + +See the vfs package comment to understand how inotify locks fit into the overall +ordering of filesystem locks. + +## Watch Targets in Different Filesystem Implementations + +In Linux, watches reside on inodes at the virtual filesystem layer. As a result, +all hard links and file descriptions on a single file will all share the same +watch set. In VFS2, there is no common inode structure across filesystem types +(some may not even have inodes), so we have to plumb inotify support through +each specific filesystem implementation. Some of the technical considerations +are outlined below. + +### Tmpfs + +For filesystems with inodes, like tmpfs, the design is quite similar to that of +Linux, where watches reside on the inode. + +### Pseudo-filesystems + +Technically, because inotify is implemented at the vfs layer in Linux, +pseudo-filesystems on top of kernfs support inotify passively. However, watches +can only track explicit filesystem operations like read/write, open/close, +mknod, etc., so watches on a target like /proc/self/fd will not generate events +every time a new fd is added or removed. As of this writing, we leave inotify +unimplemented in kernfs and anonfs; it does not seem particularly useful. + +### Gofer Filesystem (fsimpl/gofer) + +The gofer filesystem has several traits that make it difficult to support +inotify: + +* **There are no inodes.** A file is represented as a dentry that holds an + unopened p9 file (and possibly an open FID), through which the Sentry + interacts with the gofer. + * *Solution:* Because there is no inode structure stored in the sandbox, + inotify watches must be held on the dentry. For the purposes of inotify, + we assume that every dentry corresponds to a unique inode, which may + cause unexpected behavior in the presence of hard links, where multiple + dentries should share the same set of watches. Indeed, it is impossible + for us to be absolutely sure whether dentries correspond to the same + file or not, due to the following point: +* **The Sentry cannot always be aware of hard links on the remote + filesystem.** There is no way for us to confirm whether two files on the + remote filesystem are actually links to the same inode. QIDs and inodes are + not always 1:1. The assumption that dentries and inodes are 1:1 is + inevitably broken if there are remote hard links that we cannot detect. + * *Solution:* this is an issue with gofer fs in general, not only inotify, + and we will have to live with it. +* **Dentries can be cached, and then evicted.** Dentry lifetime does not + correspond to file lifetime. Because gofer fs is not entirely in-memory, the + absence of a dentry does not mean that the corresponding file does not + exist, nor does a dentry reaching zero references mean that the + corresponding file no longer exists. When a dentry reaches zero references, + it will be cached, in case the file at that path is needed again in the + future. However, the dentry may be evicted from the cache, which will cause + a new dentry to be created next time the same file path is used. The + existing watches will be lost. + * *Solution:* When a dentry reaches zero references, do not cache it if it + has any watches, so we can avoid eviction/destruction. Note that if the + dentry was deleted or invalidated (d.vfsd.IsDead()), we should still + destroy it along with its watches. Additionally, when a dentry’s last + watch is removed, we cache it if it also has zero references. This way, + the dentry can eventually be evicted from memory if it is no longer + needed. +* **Dentries can be invalidated.** Another issue with dentry lifetime is that + the remote file at the file path represented may change from underneath the + dentry. In this case, the next time that the dentry is used, it will be + invalidated and a new dentry will replace it. In this case, it is not clear + what should be done with the watches on the old dentry. + * *Solution:* Silently destroy the watches when invalidation occurs. We + have no way of knowing exactly what happened, when it happens. Inotify + instances on NFS files in Linux probably behave in a similar fashion, + since inotify is implemented at the vfs layer and is not aware of the + complexities of remote file systems. + * An alternative would be to issue some kind of event upon invalidation, + e.g. a delete event, but this has several issues: + * We cannot discern whether the remote file was invalidated because it was + moved, deleted, etc. This information is crucial, because these cases + should result in different events. Furthermore, the watches should only + be destroyed if the file has been deleted. + * Moreover, the mechanism for detecting whether the underlying file has + changed is to check whether a new QID is given by the gofer. This may + result in false positives, e.g. suppose that the server closed and + re-opened the same file, which may result in a new QID. + * Finally, the time of the event may be completely different from the time + of the file modification, since a dentry is not immediately notified + when the underlying file has changed. It would be quite unexpected to + receive the notification when invalidation was triggered, i.e. the next + time the file was accessed within the sandbox, because then the + read/write/etc. operation on the file would not result in the expected + event. + * Another point in favor of the first solution: inotify in Linux can + already be lossy on local filesystems (one of the sacrifices made so + that filesystem performance isn’t killed), and it is lossy on NFS for + similar reasons to gofer fs. Therefore, it is better for inotify to be + silent than to emit incorrect notifications. +* **There may be external users of the remote filesystem.** We can only track + operations performed on the file within the sandbox. This is sufficient + under InteropModeExclusive, but whenever there are external users, the set + of actions we are aware of is incomplete. + * *Solution:* We could either return an error or just issue a warning when + inotify is used without InteropModeExclusive. Although faulty, VFS1 + allows it when the filesystem is shared, and Linux does the same for + remote filesystems (as mentioned above, inotify sits at the vfs level). + +## Dentry Interface + +For events that must be generated above the vfs layer, we provide the following +DentryImpl methods to allow interactions with targets on any FilesystemImpl: + +* **InotifyWithParent()** generates events on the dentry’s watches as well as + its parent’s. +* **Watches()** retrieves the watch set of the target represented by the + dentry. This is used to access and modify watches on a target. +* **OnZeroWatches()** performs cleanup tasks after the last watch is removed + from a dentry. This is needed by gofer fs, which must allow a watched dentry + to be cached once it has no more watches. Most implementations can just do + nothing. Note that OnZeroWatches() must be called after all inotify locks + are released to preserve lock ordering, since it may acquire + FilesystemImpl-specific locks. + +## IN_EXCL_UNLINK + +There are several options that can be set for a watch, specified as part of the +mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some +additional support in each filesystem. + +A watch with IN_EXCL_UNLINK will not generate events for its target if it +corresponds to a path that was unlinked. For instance, if an fd is opened on +“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the +fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This +requires each DentryImpl to keep track of whether it has been unlinked, in order +to determine whether events should be sent to watches with IN_EXCL_UNLINK. + +## IN_ONESHOT + +One-shot watches expire after generating a single event. When an event occurs, +all one-shot watches on the target that successfully generated an event are +removed. Lock ordering can cause the management of one-shot watches to be quite +expensive; see Watches.Notify() for more information. diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 286510195..ba6e6ed49 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -27,6 +27,8 @@ import ( ) // Dentry is a required type parameter that is a struct with the given fields. +// +// +stateify savable type Dentry struct { // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. vfsd vfs.Dentry @@ -43,7 +45,7 @@ type Dentry struct { // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func IsAncestorDentry(d, d2 *Dentry) bool { - for { + for d2 != nil { // Stop at root, where d2.parent == nil. if d2.parent == d { return true } @@ -52,6 +54,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool { } d2 = d2.parent } + return false } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. @@ -68,7 +71,7 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { return vfs.PrependPathAtVFSRootError{} } - if &d.vfsd == mnt.Root() { + if mnt != nil && &d.vfsd == mnt.Root() { return nil } if d.parent == nil { @@ -78,3 +81,12 @@ func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath d = d.parent } } + +// DebugPathname returns a pathname to d relative to its filesystem root. +// DebugPathname does not correspond to any Linux function; it's used to +// generate dentry pathnames for debugging. +func DebugPathname(d *Dentry) string { + var b fspath.Builder + _ = PrependPath(vfs.VirtualDentry{}, nil, d, &b) + return b.String() +} diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 05a3051a4..3f0b8f45b 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16 // // The way events are labelled appears somewhat arbitrary, but they must match // Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +// +// +stateify savable type EventType uint8 // PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and @@ -49,14 +51,12 @@ const ( // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements FileDescriptionImpl. // -// Lock ordering: -// Inotify.mu -> Watches.mu -> Inotify.evMu -// // +stateify savable type Inotify struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. These should not be exposed to the @@ -102,7 +102,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) - defer vd.DecRef() + defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), @@ -120,19 +120,39 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { + var ds []*Dentry + // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.handleDeletion from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all // references to it below. i.mu.Lock() - defer i.mu.Unlock() for _, w := range i.watches { // Remove references to the watch from the watches set on the target. We // don't need to worry about the references from i.watches, since this // file description is about to be destroyed. - w.set.Remove(i.id) + d := w.target + ws := d.Watches() + // Watchable dentries should never return a nil watch set. + if ws == nil { + panic("Cannot remove watch from an unwatchable dentry") + } + ws.Remove(i.id) + if ws.Size() == 0 { + ds = append(ds, d) + } } + i.mu.Unlock() + + for _, d := range ds { + d.OnZeroWatches(ctx) + } +} + +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") } // EventRegister implements waiter.Waitable. @@ -161,12 +181,12 @@ func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { return mask & ready } -// PRead implements FileDescriptionImpl. +// PRead implements FileDescriptionImpl.PRead. func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.ESPIPE } -// PWrite implements FileDescriptionImpl. +// PWrite implements FileDescriptionImpl.PWrite. func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, syserror.ESPIPE } @@ -225,7 +245,7 @@ func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOpt return writeLen, nil } -// Ioctl implements fs.FileOperations.Ioctl. +// Ioctl implements FileDescriptionImpl.Ioctl. func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { case linux.FIONREAD: @@ -271,20 +291,19 @@ func (i *Inotify) queueEvent(ev *Event) { // newWatchLocked creates and adds a new watch to target. // -// Precondition: i.mu must be locked. -func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch { - targetWatches := target.Watches() +// Precondition: i.mu must be locked. ws must be the watch set for target d. +func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { w := &Watch{ - owner: i, - wd: i.nextWatchIDLocked(), - set: targetWatches, - mask: mask, + owner: i, + wd: i.nextWatchIDLocked(), + target: d, + mask: mask, } // Hold the watch in this inotify instance as well as the watch set on the // target. i.watches[w.wd] = w - targetWatches.Add(w) + ws.Add(w) return w } @@ -296,22 +315,11 @@ func (i *Inotify) nextWatchIDLocked() int32 { return i.nextWatchMinusOne } -// handleDeletion handles the deletion of the target of watch w. It removes w -// from i.watches and a watch removal event is generated. -func (i *Inotify) handleDeletion(w *Watch) { - i.mu.Lock() - _, found := i.watches[w.wd] - delete(i.watches, w.wd) - i.mu.Unlock() - - if found { - i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) - } -} - // AddWatch constructs a new inotify watch and adds it to the target. It // returns the watch descriptor returned by inotify_add_watch(2). -func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { +// +// The caller must hold a reference on target. +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to @@ -319,8 +327,14 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { i.mu.Lock() defer i.mu.Unlock() + ws := target.Watches() + if ws == nil { + // While Linux supports inotify watches on all filesystem types, watches on + // filesystems like kernfs are not generally useful, so we do not. + return 0, syserror.EPERM + } // Does the target already have a watch from this inotify instance? - if existing := target.Watches().Lookup(i.id); existing != nil { + if existing := ws.Lookup(i.id); existing != nil { newmask := mask if mask&linux.IN_MASK_ADD != 0 { // "Add (OR) events to watch mask for this pathname if it already @@ -328,17 +342,17 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { newmask |= atomic.LoadUint32(&existing.mask) } atomic.StoreUint32(&existing.mask, newmask) - return existing.wd + return existing.wd, nil } // No existing watch, create a new watch. - w := i.newWatchLocked(target, mask) - return w.wd + w := i.newWatchLocked(target, ws, mask) + return w.wd, nil } // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -352,9 +366,19 @@ func (i *Inotify) RmWatch(wd int32) error { delete(i.watches, wd) // Remove the watch from the watch target. - w.set.Remove(w.OwnerID()) + ws := w.target.Watches() + // AddWatch ensures that w.target has a non-nil watch set. + if ws == nil { + panic("Watched dentry cannot have nil watch set") + } + ws.Remove(w.OwnerID()) + remaining := ws.Size() i.mu.Unlock() + if remaining == 0 { + w.target.OnZeroWatches(ctx) + } + // Generate the event for the removal. i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) @@ -373,6 +397,13 @@ type Watches struct { ws map[uint64]*Watch } +// Size returns the number of watches held by w. +func (w *Watches) Size() int { + w.mu.Lock() + defer w.mu.Unlock() + return len(w.ws) +} + // Lookup returns the watch owned by an inotify instance with the given id. // Returns nil if no such watch exists. // @@ -423,64 +454,86 @@ func (w *Watches) Remove(id uint64) { return } - if _, ok := w.ws[id]; !ok { - // While there's technically no problem with silently ignoring a missing - // watch, this is almost certainly a bug. - panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + // It is possible for w.Remove() to be called for the same watch multiple + // times. See the treatment of one-shot watches in Watches.Notify(). + if _, ok := w.ws[id]; ok { + delete(w.ws, id) } - delete(w.ws, id) } -// Notify queues a new event with all watches in this set. -func (w *Watches) Notify(name string, events, cookie uint32, et EventType) { - w.NotifyWithExclusions(name, events, cookie, et, false) +// Notify queues a new event with watches in this set. Watches with +// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been +// unlinked. +func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { + var hasExpired bool + w.mu.RLock() + for _, watch := range w.ws { + if unlinked && watch.ExcludeUnlinked() && et == PathEvent { + continue + } + if watch.Notify(name, events, cookie) { + hasExpired = true + } + } + w.mu.RUnlock() + + if hasExpired { + w.cleanupExpiredWatches(ctx) + } } -// NotifyWithExclusions queues a new event with watches in this set. Watches -// with IN_EXCL_UNLINK are skipped if the event is coming from a child that -// has been unlinked. -func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) { - // N.B. We don't defer the unlocks because Notify is in the hot path of - // all IO operations, and the defer costs too much for small IO - // operations. +// This function is relatively expensive and should only be called where there +// are expired watches. +func (w *Watches) cleanupExpiredWatches(ctx context.Context) { + // Because of lock ordering, we cannot acquire Inotify.mu for each watch + // owner while holding w.mu. As a result, store expired watches locally + // before removing. + var toRemove []*Watch w.mu.RLock() for _, watch := range w.ws { - if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent { - continue + if atomic.LoadInt32(&watch.expired) == 1 { + toRemove = append(toRemove, watch) } - watch.Notify(name, events, cookie) } w.mu.RUnlock() + for _, watch := range toRemove { + watch.owner.RmWatch(ctx, watch.wd) + } } -// HandleDeletion is called when the watch target is destroyed to emit -// the appropriate events. -func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) +// HandleDeletion is called when the watch target is destroyed. Clear the +// watch set, detach watches from the inotify instances they belong to, and +// generate the appropriate events. +func (w *Watches) HandleDeletion(ctx context.Context) { + w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) - // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied - // by value. Ideally, we wouldn't have this circular locking so we can just - // notify of IN_DELETE_SELF in the same loop below. - // - // We can't hold w.mu while calling watch.handleDeletion to preserve lock - // ordering w.r.t to the owner inotify instances. Instead, atomically move - // the watches map into a local variable so we can iterate over it safely. - // - // Because of this however, it is possible for the watches' owners to reach - // this inode while the inode has no refs. This is still safe because the - // owners can only reach the inode until this function finishes calling - // watch.handleDeletion below and the inode is guaranteed to exist in the - // meantime. But we still have to be very careful not to rely on inode state - // that may have been already destroyed. + // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for + // the owner of each watch being deleted. Instead, atomically store the + // watches map in a local variable and set it to nil so we can iterate over + // it with the assurance that there will be no concurrent accesses. var ws map[uint64]*Watch w.mu.Lock() ws = w.ws w.ws = nil w.mu.Unlock() + // Remove each watch from its owner's watch set, and generate a corresponding + // watch removal event. for _, watch := range ws { - // TODO(gvisor.dev/issue/1479): consider refactoring this. - watch.handleDeletion() + i := watch.owner + i.mu.Lock() + _, found := i.watches[watch.wd] + delete(i.watches, watch.wd) + + // Release mutex before notifying waiters because we don't control what + // they can do. + i.mu.Unlock() + + // If watch was not found, it was removed from the inotify instance before + // we could get to it, in which case we should not generate an event. + if found { + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + } } } @@ -489,18 +542,28 @@ func (w *Watches) HandleDeletion() { // +stateify savable type Watch struct { // Inotify instance which owns this watch. + // + // This field is immutable after creation. owner *Inotify // Descriptor for this watch. This is unique across an inotify instance. + // + // This field is immutable after creation. wd int32 - // set is the watch set containing this watch. It belongs to the target file - // of this watch. - set *Watches + // target is a dentry representing the watch target. Its watch set contains this watch. + // + // This field is immutable after creation. + target *Dentry // Events being monitored via this watch. Must be accessed with atomic // memory operations. mask uint32 + + // expired is set to 1 to indicate that this watch is a one-shot that has + // already sent a notification and therefore can be removed. Must be accessed + // with atomic memory operations. + expired int32 } // OwnerID returns the id of the inotify instance that owns this watch. @@ -508,23 +571,29 @@ func (w *Watch) OwnerID() uint64 { return w.owner.id } -// ExcludeUnlinkedChildren indicates whether the watched object should continue -// to be notified of events of its children after they have been unlinked, e.g. -// for an open file descriptor. +// ExcludeUnlinked indicates whether the watched object should continue to be +// notified of events originating from a path that has been unlinked. // -// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK. -// We can do this by keeping track of the set of unlinked children in Watches -// to skip notification. -func (w *Watch) ExcludeUnlinkedChildren() bool { +// For example, if "foo/bar" is opened and then unlinked, operations on the +// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. +func (w *Watch) ExcludeUnlinked() bool { return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 } -// Notify queues a new event on this watch. -func (w *Watch) Notify(name string, events uint32, cookie uint32) { +// Notify queues a new event on this watch. Returns true if this is a one-shot +// watch that should be deleted, after this event was successfully queued. +func (w *Watch) Notify(name string, events uint32, cookie uint32) bool { + if atomic.LoadInt32(&w.expired) == 1 { + // This is a one-shot watch that is already in the process of being + // removed. This may happen if a second event reaches the watch target + // before this watch has been removed. + return false + } + mask := atomic.LoadUint32(&w.mask) if mask&events == 0 { // We weren't watching for this event. - return + return false } // Event mask should include bits matched from the watch plus all control @@ -533,11 +602,11 @@ func (w *Watch) Notify(name string, events uint32, cookie uint32) { effectiveMask := unmaskableBits | mask matchedEvents := effectiveMask & events w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) -} - -// handleDeletion handles the deletion of w's target. -func (w *Watch) handleDeletion() { - w.owner.handleDeletion(w) + if mask&linux.IN_ONESHOT != 0 { + atomic.StoreInt32(&w.expired, 1) + return true + } + return false } // Event represents a struct inotify_event from linux. @@ -605,7 +674,7 @@ func (e *Event) setName(name string) { func (e *Event) sizeOf() int { s := inotifyEventBaseSize + int(e.len) if s < inotifyEventBaseSize { - panic("overflow") + panic("Overflowed event size") } return s } @@ -675,11 +744,15 @@ func InotifyEventFromStatMask(mask uint32) uint32 { } // InotifyRemoveChild sends the appriopriate notifications to the watch sets of -// the child being removed and its parent. -func InotifyRemoveChild(self, parent *Watches, name string) { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) - // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. +// the child being removed and its parent. Note that unlike most pairs of +// parent/child notifications, the child is notified first in this case. +func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) { + if self != nil { + self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + } + if parent != nil { + parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + } } // InotifyRename sends the appriopriate notifications to the watch sets of the @@ -690,8 +763,14 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, dirEv = linux.IN_ISDIR } cookie := uniqueid.InotifyCookie(ctx) - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) + if oldParent != nil { + oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + } + if newParent != nil { + newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + } // Somewhat surprisingly, self move events do not have a cookie. - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) + if renamed != nil { + renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + } } diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock.go index 724dfe743..1ff202f2a 100644 --- a/pkg/sentry/vfs/lock/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -12,14 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package lock provides POSIX and BSD style file locking for VFS2 file -// implementations. -// -// The actual implementations can be found in the lock package under -// sentry/fs/lock. -package lock +package vfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,6 +28,8 @@ import ( // Note that in Linux these two types of locks are _not_ cooperative, because // race and deadlock conditions make merging them prohibitive. We do the same // and keep them oblivious to each other. +// +// +stateify savable type FileLocks struct { // bsd is a set of BSD-style advisory file wide locks, see flock(2). bsd fslock.Locks @@ -44,7 +43,13 @@ func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fsloc if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { return nil } - return syserror.ErrWouldBlock + + // Return an appropriate error for the unsuccessful lock attempt, depending on + // whether this is a blocking or non-blocking operation. + if block == nil { + return syserror.ErrWouldBlock + } + return syserror.ERESTARTSYS } // UnlockBSD releases a BSD-style lock on the entire file. @@ -56,17 +61,58 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { } // LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } if fl.posix.LockRegion(uid, t, rng, block) { return nil } - return syserror.ErrWouldBlock + + // Return an appropriate error for the unsuccessful lock attempt, depending on + // whether this is a blocking or non-blocking operation. + if block == nil { + return syserror.ErrWouldBlock + } + return syserror.ERESTARTSYS } // UnlockPOSIX releases a POSIX-style lock on a file region. // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. -func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } fl.posix.UnlockRegion(uid, rng) + return nil +} + +func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return fslock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return fslock.LockRange{}, err + } + off = int64(stat.Size) + default: + return fslock.LockRange{}, syserror.EINVAL + } + + return fslock.ComputeRange(int64(start), int64(length), off) } diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD deleted file mode 100644 index d9ab063b7..000000000 --- a/pkg/sentry/vfs/lock/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "lock", - srcs = ["lock.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/fs/lock", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index cc1e7d764..638b5d830 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct { xattrs map[string]string } -// Getxattr returns the value at 'name'. -func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) { +// GetXattr returns the value at 'name'. +func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) { x.mu.RLock() value, ok := x.xattrs[opts.Name] x.mu.RUnlock() @@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, return value, nil } -// Setxattr sets 'value' at 'name'. -func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { +// SetXattr sets 'value' at 'name'. +func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error { x.mu.Lock() defer x.mu.Unlock() if x.xattrs == nil { @@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { return nil } -// Listxattr returns all names in xattrs. -func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { +// ListXattr returns all names in xattrs. +func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) { // Keep track of the size of the buffer needed in listxattr(2) for the list. listSize := 0 x.mu.RLock() @@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { return names, nil } -// Removexattr removes the xattr at 'name'. -func (x *SimpleExtendedAttributes) Removexattr(name string) error { +// RemoveXattr removes the xattr at 'name'. +func (x *SimpleExtendedAttributes) RemoveXattr(name string) error { x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index e4ac6524b..d452d2cda 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) @@ -46,8 +47,9 @@ import ( // +stateify savable type Mount struct { // vfs, fs, root are immutable. References are held on fs and root. + // Note that for a disconnected mount, root may be nil. // - // Invariant: root belongs to fs. + // Invariant: if not nil, root belongs to fs. vfs *VirtualFilesystem fs *Filesystem root *Dentry @@ -55,13 +57,17 @@ type Mount struct { // ID is the immutable mount ID. ID uint64 + // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except + // for MS_RDONLY which is tracked in "writers". Immutable. + Flags MountFlags + // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on // key.parent and key.point if they are not nil. // // Invariant: key.parent != nil iff key.point != nil. key.point belongs to // key.parent.fs. - key mountKey + key mountKey `state:".(VirtualDentry)"` // ns is the namespace in which this Mount was mounted. ns is protected by // VirtualFilesystem.mountMu. @@ -81,10 +87,6 @@ type Mount struct { // umounted is true. umounted is protected by VirtualFilesystem.mountMu. umounted bool - // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except - // for MS_RDONLY which is tracked in "writers". - flags MountFlags - // The lower 63 bits of writers is the number of calls to // Mount.CheckBeginWrite() that have not yet been paired with a call to // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. @@ -95,16 +97,19 @@ type Mount struct { func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ ID: atomic.AddUint64(&vfs.lastMountID, 1), + Flags: opts.Flags, vfs: vfs, fs: fs, root: root, - flags: opts.Flags, ns: mntns, refs: 1, } if opts.ReadOnly { mnt.setReadOnlyLocked(true) } + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Register(mnt, "vfs.Mount") + } return mnt } @@ -113,13 +118,12 @@ func (mnt *Mount) Options() MountOptions { mnt.vfs.mountMu.Lock() defer mnt.vfs.mountMu.Unlock() return MountOptions{ - Flags: mnt.flags, - ReadOnly: mnt.readOnly(), + Flags: mnt.Flags, + ReadOnly: mnt.ReadOnly(), } } -// A MountNamespace is a collection of Mounts. -// +// A MountNamespace is a collection of Mounts.// // MountNamespaces are reference-counted. Unless otherwise specified, all // MountNamespace methods require that a reference is held. // @@ -127,13 +131,14 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + MountNamespaceRefs + + // Owner is the usernamespace that owns this mount namespace. + Owner *auth.UserNamespace + // root is the MountNamespace's root mount. root is immutable. root *Mount - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. @@ -152,21 +157,22 @@ type MountNamespace struct { // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } - fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return nil, err } mntns := &MountNamespace{ - refs: 1, + Owner: creds.UserNamespace, mountpoints: make(map[*Dentry]uint32), } - mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) + mntns.EnableLeakCheck() + mntns.root = newMount(vfs, fs, root, mntns, opts) return mntns, nil } @@ -197,8 +203,8 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth if err != nil { return nil, err } - defer root.DecRef() - defer fs.DecRef() + defer root.DecRef(ctx) + defer fs.DecRef(ctx) return vfs.NewDisconnectedMount(fs, root, opts) } @@ -218,7 +224,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() - vd.DecRef() + vd.DecRef(ctx) return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -240,7 +246,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr // This can't fail since we're holding vfs.mountMu. nextmnt.root.IncRef() vd.dentry.mu.Unlock() - vd.DecRef() + vd.DecRef(ctx) vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, @@ -260,16 +266,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr } // MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// The VirtualFilesystem will hold a reference to the Mount until it is unmounted. +// +// This method returns the mounted Mount without a reference, for convenience +// during VFS setup when there is no chance of racing with unmount. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) if err != nil { - return err + return nil, err } + defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { - mnt.DecRef() - return err + return nil, err } - return nil + return mnt, nil } // UmountAt removes the Mount at the given path. @@ -279,6 +289,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } // MNT_FORCE is currently unimplemented except for the permission check. + // Force unmounting specifically requires CAP_SYS_ADMIN in the root user + // namespace, and not in the owner user namespace for the target mount. See + // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { return syserror.EPERM } @@ -287,13 +300,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if err != nil { return err } - defer vd.DecRef() + defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { return syserror.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { - defer mntns.DecRef() + defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() return syserror.EINVAL @@ -329,14 +342,15 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } return nil } +// +stateify savable type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. @@ -363,8 +377,9 @@ type umountRecursiveOptions struct { // // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { if !mnt.umounted { mnt.umounted = true @@ -393,10 +408,19 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu // connectLocked makes vd the mount parent/point for mnt. It consumes // references held by vd. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. d.mu must be locked. mnt.parent() == nil. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * d.mu must be locked. +// * mnt.parent() == nil, i.e. mnt must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { - mnt.storeKey(vd) + if checkInvariants { + if mnt.parent() != nil { + panic("VFS.connectLocked called on connected mount") + } + } + mnt.IncRef() // dropped by callers of umountRecursiveLocked + mnt.setKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } @@ -416,11 +440,18 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns // disconnectLocked makes vd have no mount parent/point and returns its old // mount parent/point with a reference held. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. mnt.parent() != nil. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { - vd := mnt.loadKey() - mnt.storeKey(VirtualDentry{}) + vd := mnt.getKey() + if checkInvariants { + if vd.mount != nil { + panic("VFS.disconnectLocked called on disconnected mount") + } + } + mnt.loadKey(VirtualDentry{}) delete(vd.mount.children, mnt) atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 mnt.ns.mountpoints[vd.dentry]-- @@ -461,36 +492,43 @@ func (mnt *Mount) IncRef() { } // DecRef decrements mnt's reference count. -func (mnt *Mount) DecRef() { - refs := atomic.AddInt64(&mnt.refs, -1) - if refs&^math.MinInt64 == 0 { // mask out MSB - var vd VirtualDentry - if mnt.parent() != nil { - mnt.vfs.mountMu.Lock() - mnt.vfs.mounts.seq.BeginWrite() - vd = mnt.vfs.disconnectLocked(mnt) - mnt.vfs.mounts.seq.EndWrite() - mnt.vfs.mountMu.Unlock() - } - mnt.root.DecRef() - mnt.fs.DecRef() - if vd.Ok() { - vd.DecRef() +func (mnt *Mount) DecRef(ctx context.Context) { + r := atomic.AddInt64(&mnt.refs, -1) + if r&^math.MinInt64 == 0 { // mask out MSB + if refsvfs2.LeakCheckEnabled() { + refsvfs2.Unregister(mnt, "vfs.Mount") } + mnt.destroy(ctx) } } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") +func (mnt *Mount) destroy(ctx context.Context) { + var vd VirtualDentry + if mnt.parent() != nil { + mnt.vfs.mountMu.Lock() + mnt.vfs.mounts.seq.BeginWrite() + vd = mnt.vfs.disconnectLocked(mnt) + mnt.vfs.mounts.seq.EndWrite() + mnt.vfs.mountMu.Unlock() } + if mnt.root != nil { + mnt.root.DecRef(ctx) + } + mnt.fs.DecRef(ctx) + if vd.Ok() { + vd.DecRef(ctx) + } +} + +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (mnt *Mount) LeakMessage() string { + return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs)) } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef() { +func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + mntns.MountNamespaceRefs.DecRef(func() { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ @@ -499,14 +537,12 @@ func (mntns *MountNamespace) DecRef() { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } + }) } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes @@ -516,7 +552,7 @@ func (mntns *MountNamespace) DecRef() { // getMountAt is analogous to Linux's fs/namei.c:follow_mount(). // // Preconditions: References are held on mnt and d. -func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { +func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount { // The first mount is special-cased: // // - The caller is assumed to have checked d.isMounted() already. (This @@ -547,7 +583,7 @@ retryFirst: // Raced with umount. continue } - mnt.DecRef() + mnt.DecRef(ctx) mnt = next d = next.root } @@ -558,9 +594,10 @@ retryFirst: // mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // -// Preconditions: References are held on mnt and root. vfsroot is not (mnt, -// mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { +// Preconditions: +// * References are held on mnt and root. +// * vfsroot is not (mnt, mnt.root). +func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -584,12 +621,12 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryFirst } mnt = parent @@ -617,22 +654,29 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryNotFirst } - d.DecRef() - mnt.DecRef() + d.DecRef(ctx) + mnt.DecRef(ctx) mnt = parent d = point } return VirtualDentry{mnt, d} } +// SetMountReadOnly sets the mount as ReadOnly. +func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + return mnt.setReadOnlyLocked(ro) +} + // CheckBeginWrite increments the counter of in-progress write operations on // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns // EROFS. @@ -670,7 +714,8 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error { return nil } -func (mnt *Mount) readOnly() bool { +// ReadOnly returns true if mount is readonly. +func (mnt *Mount) ReadOnly() bool { return atomic.LoadInt64(&mnt.writers) < 0 } @@ -698,14 +743,12 @@ func (mnt *Mount) Root() *Dentry { return mnt.root } -// Root returns mntns' root. A reference is taken on the returned -// VirtualDentry. +// Root returns mntns' root. It does not take a reference on the returned Dentry. func (mntns *MountNamespace) Root() VirtualDentry { vd := VirtualDentry{ mount: mntns.root, dentry: mntns.root.root, } - vd.IncRef() return vd } @@ -713,11 +756,23 @@ func (mntns *MountNamespace) Root() VirtualDentry { // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -728,7 +783,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -738,10 +793,13 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi } opts := "rw" - if mnt.readOnly() { + if mnt.ReadOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } @@ -759,11 +817,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or + // vfs.StatAt() (=> FilesystemImpl.StatAt()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + + creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -774,7 +846,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -787,9 +859,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo Root: mntRootVD, Start: mntRootVD, } - statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. + ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) break } @@ -801,6 +874,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). + // Note that even if the call to mnt.parent() races with Mount + // destruction (which is possible since we're not holding vfs.mountMu), + // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID @@ -823,14 +899,15 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo // (6) Mount options. opts := "rw" - if mnt.readOnly() { + if mnt.ReadOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } - // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is - // set. fmt.Fprintf(buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". @@ -861,7 +938,7 @@ func superBlockOpts(mountPath string, mnt *Mount) string { // gVisor doesn't (yet) have a concept of super block options, so we // use the ro/rw bit from the mount flag. opts := "rw" - if mnt.readOnly() { + if mnt.ReadOnly() { opts = "ro" } diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index 3335e4057..cb8c56bd3 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) { mt.Init() mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.setKey(VirtualDentry{&Mount{}, &Dentry{}}) mt.Insert(mount) if m := mt.Lookup(mount.parent(), mount.point()); m != mount { @@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false func newBenchMount() *Mount { mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}}) return mount } @@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) { for i := 0; i < numMounts; i++ { mount := newBenchMount() mt.Insert(mount) - keys = append(keys, mount.loadKey()) + keys = append(keys, mount.saveKey()) } var ready sync.WaitGroup @@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.saveKey() ms[key] = mount keys = append(keys, key) } @@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.getKey() ms.Store(key, mount) keys = append(keys, key) } @@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms[mount.loadKey()] = mount + ms[mount.getKey()] = mount } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { var ms sync.Map for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } } @@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } } @@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } b.ResetTimer() for i := range mounts { mount := mounts[i] - delete(ms, mount.loadKey()) + delete(ms, mount.saveKey()) } } @@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) { var ms sync.Map for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Delete(mount.loadKey()) + ms.Delete(mount.saveKey()) } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 70f850ca4..cb48c37a1 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -34,6 +34,8 @@ import ( // structurally identical to VirtualDentry, but stores its fields as // unsafe.Pointer since mutators synchronize with VFS path traversal using // seqcounts. +// +// This is explicitly not savable. type mountKey struct { parent unsafe.Pointer // *Mount point unsafe.Pointer // *Dentry @@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } -func (mnt *Mount) loadKey() VirtualDentry { +func (mnt *Mount) getKey() VirtualDentry { return VirtualDentry{ mount: mnt.parent(), dentry: mnt.point(), } } +func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } + // Invariant: mnt.key.parent == nil. vd.Ok(). -func (mnt *Mount) storeKey(vd VirtualDentry) { +func (mnt *Mount) setKey(vd VirtualDentry) { atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) } +func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } + // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). @@ -205,6 +211,26 @@ loop: } } +// Range calls f on each Mount in mt. If f returns false, Range stops iteration +// and returns immediately. +func (mt *mountTable) Range(f func(*Mount) bool) { + tcap := uintptr(1) << (mt.size & mtSizeOrderMask) + slotPtr := mt.slots + last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes)) + for { + slot := (*mountSlot)(slotPtr) + if slot.value != nil { + if !f((*Mount)(slot.value)) { + return + } + } + if slotPtr == last { + return + } + slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes) + } +} + // Insert inserts the given mount into mt. // // Preconditions: mt must not already contain a Mount with the same mount point @@ -217,8 +243,9 @@ func (mt *mountTable) Insert(mount *Mount) { // insertSeqed inserts the given mount into mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must not -// already contain a Mount with the same mount point and parent. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must not already contain a Mount with the same mount point and parent. func (mt *mountTable) insertSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) @@ -269,9 +296,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) { atomic.StorePointer(&mt.slots, newSlots) } -// Preconditions: There are no concurrent mutators of the table (slots, cap). -// If the table is visible to readers, then mt.seq must be in a writer critical -// section. cap must be a power of 2. +// Preconditions: +// * There are no concurrent mutators of the table (slots, cap). +// * If the table is visible to readers, then mt.seq must be in a writer +// critical section. +// * cap must be a power of 2. func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { mask := cap - 1 off := (hash & mask) * mountSlotBytes @@ -313,8 +342,9 @@ func (mt *mountTable) Remove(mount *Mount) { // removeSeqed removes the given mount from mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must contain -// mount. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must contain mount. func (mt *mountTable) removeSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) tcap := uintptr(1) << (mt.size & mtSizeOrderMask) diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 53d364c5c..bc79e5ecc 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -21,6 +21,8 @@ import ( // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and // FilesystemImpl.GetDentryAt(). +// +// +stateify savable type GetDentryOptions struct { // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that // the returned Dentry is a directory for which creds has search @@ -30,6 +32,8 @@ type GetDentryOptions struct { // MkdirOptions contains options to VirtualFilesystem.MkdirAt() and // FilesystemImpl.MkdirAt(). +// +// +stateify savable type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode @@ -56,6 +60,8 @@ type MkdirOptions struct { // MknodOptions contains options to VirtualFilesystem.MknodAt() and // FilesystemImpl.MknodAt(). +// +// +stateify savable type MknodOptions struct { // Mode is the file type and mode bits for the created file. Mode linux.FileMode @@ -72,12 +78,31 @@ type MknodOptions struct { // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. // MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers. +// +// +stateify savable type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool + + // NoATime is equivalent to MS_NOATIME and indicates that the + // filesystem should not update access time in-place. + NoATime bool + + // NoDev is equivalent to MS_NODEV and indicates that the + // filesystem should not allow access to devices (special files). + // TODO(gVisor.dev/issue/3186): respect this flag in non FUSE + // filesystems. + NoDev bool + + // NoSUID is equivalent to MS_NOSUID and indicates that the + // filesystem should not honor set-user-ID and set-group-ID bits or + // file capabilities when executing programs. + NoSUID bool } // MountOptions contains options to VirtualFilesystem.MountAt(). +// +// +stateify savable type MountOptions struct { // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC. Flags MountFlags @@ -88,13 +113,17 @@ type MountOptions struct { // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). GetFilesystemOptions GetFilesystemOptions - // If InternalMount is true, allow the use of filesystem types for which - // RegisterFilesystemTypeOptions.AllowUserMount == false. + // InternalMount indicates whether the mount operation is coming from the + // application, i.e. through mount(2). If InternalMount is true, allow the use + // of filesystem types for which RegisterFilesystemTypeOptions.AllowUserMount + // == false. InternalMount bool } // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). +// +// +stateify savable type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // @@ -120,6 +149,8 @@ type OpenOptions struct { // ReadOptions contains options to FileDescription.PRead(), // FileDescriptionImpl.PRead(), FileDescription.Read(), and // FileDescriptionImpl.Read(). +// +// +stateify savable type ReadOptions struct { // Flags contains flags as specified for preadv2(2). Flags uint32 @@ -127,6 +158,8 @@ type ReadOptions struct { // RenameOptions contains options to VirtualFilesystem.RenameAt() and // FilesystemImpl.RenameAt(). +// +// +stateify savable type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 @@ -138,6 +171,8 @@ type RenameOptions struct { // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), // FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and // FileDescriptionImpl.SetStat(). +// +// +stateify savable type SetStatOptions struct { // Stat is the metadata that should be set. Only fields indicated by // Stat.Mask should be set. @@ -149,10 +184,18 @@ type SetStatOptions struct { // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask // instead). Stat linux.Statx + + // NeedWritePerm indicates that write permission on the file is needed for + // this operation. This is needed for truncate(2) (note that ftruncate(2) + // does not require the same check--instead, it checks that the fd is + // writable). + NeedWritePerm bool } // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() // and FilesystemImpl.BoundEndpointAt(). +// +// +stateify savable type BoundEndpointOptions struct { // Addr is the path of the file whose socket endpoint is being retrieved. // It is generally irrelevant: most endpoints are stored at a dentry that @@ -169,10 +212,12 @@ type BoundEndpointOptions struct { Addr string } -// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), -// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and -// FileDescriptionImpl.Getxattr(). -type GetxattrOptions struct { +// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(), +// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and +// FileDescriptionImpl.GetXattr(). +// +// +stateify savable +type GetXattrOptions struct { // Name is the name of the extended attribute to retrieve. Name string @@ -183,10 +228,12 @@ type GetxattrOptions struct { Size uint64 } -// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), -// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and -// FileDescriptionImpl.Setxattr(). -type SetxattrOptions struct { +// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(), +// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and +// FileDescriptionImpl.SetXattr(). +// +// +stateify savable +type SetXattrOptions struct { // Name is the name of the extended attribute being mutated. Name string @@ -200,6 +247,8 @@ type SetxattrOptions struct { // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). +// +// +stateify savable type StatOptions struct { // Mask is the set of fields in the returned Statx that the FilesystemImpl // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. @@ -217,6 +266,8 @@ type StatOptions struct { } // UmountOptions contains options to VirtualFilesystem.UmountAt(). +// +// +stateify savable type UmountOptions struct { // Flags contains flags as specified for umount2(2). Flags uint32 @@ -225,6 +276,8 @@ type UmountOptions struct { // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). +// +// +stateify savable type WriteOptions struct { // Flags contains flags as specified for pwritev2(2). Flags uint32 diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cd78d66bc..e4da15009 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() @@ -64,12 +64,12 @@ loop: // of FilesystemImpl.PrependPath() may return nil instead. break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() loop: @@ -112,12 +112,12 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() unreachable := false @@ -157,13 +157,13 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f9647f90e..d48520d58 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -16,6 +16,7 @@ package vfs import ( "math" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -25,6 +26,8 @@ import ( ) // AccessTypes is a bitmask of Unix file permissions. +// +// +stateify savable type AccessTypes uint16 // Bits in AccessTypes. @@ -94,6 +97,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu return syserror.EACCES } +// MayLink determines whether creating a hard link to a file with the given +// mode, kuid, and kgid is permitted. +// +// This corresponds to Linux's fs/namei.c:may_linkat. +func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + // Source inode owner can hardlink all they like; otherwise, it must be a + // safe source. + if CanActAsOwner(creds, kuid) { + return nil + } + + // Only regular files can be hard linked. + if mode.FileType() != linux.S_IFREG { + return syserror.EPERM + } + + // Setuid files should not get pinned to the filesystem. + if mode&linux.S_ISUID != 0 { + return syserror.EPERM + } + + // Executable setgid files should not get pinned to the filesystem, but we + // don't support S_IXGRP anyway. + + // Hardlinking to unreadable or unwritable sources is dangerous. + if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { + return syserror.EPERM + } + return nil +} + // AccessTypesForOpenFlags returns the access types required to open a file // with the given OpenOptions.Flags. Note that this is NOT the same thing as // the set of accesses permitted for the opened file: @@ -152,7 +186,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -184,6 +219,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return syserror.EPERM } } + if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { + return err + } + } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || @@ -199,6 +239,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return nil } +// CheckDeleteSticky checks whether the sticky bit is set on a directory with +// the given file mode, and if so, checks whether creds has permission to +// remove a file owned by childKUID from a directory with the given mode. +// CheckDeleteSticky is consistent with fs/linux.h:check_sticky(). +func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error { + if parentMode&linux.ModeSticky == 0 { + return nil + } + if CanActAsOwner(creds, childKUID) { + return nil + } + return syserror.EPERM +} + // CanActAsOwner returns true if creds can act as the owner of a file with the // given owning UID, consistent with Linux's // fs/inode.c:inode_owner_or_capable(). @@ -220,7 +274,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth // operation must not proceed. Otherwise it returns the max length allowed to // without violating the limit. func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { - fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur if fileSizeLimit > math.MaxInt64 { return size, nil } @@ -233,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { } return size, nil } + +// CheckXattrPermissions checks permissions for extended attribute access. +// This is analogous to fs/xattr.c:xattr_permission(). Some key differences: +// * Does not check for read-only filesystem property. +// * Does not check inode immutability or append only mode. In both cases EPERM +// must be returned by filesystem implementations. +// * Does not do inode permission checks. Filesystem implementations should +// handle inode permission checks as they may differ across implementations. +func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error { + switch { + case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX): + // The trusted.* namespace can only be accessed by privileged + // users. + if creds.HasCapability(linux.CAP_SYS_ADMIN) { + return nil + } + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): + // In the user.* namespace, only regular files and directories can have + // extended attributes. For sticky directories, only the owner and + // privileged users can write attributes. + filetype := mode.FileType() + if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + } + if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { + return syserror.EPERM + } + } + return nil +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 9d047ff88..e4fd55012 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -34,6 +35,8 @@ import ( // FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. +// +// +stateify savable type ResolvingPath struct { vfs *VirtualFilesystem root VirtualDentry // refs borrowed from PathOperation @@ -87,6 +90,7 @@ func init() { // so error "constants" are really mutable vars, necessitating somewhat // expensive interface object comparisons. +// +stateify savable type resolveMountRootOrJumpError struct{} // Error implements error.Error. @@ -94,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string { return "resolving mount root or jump" } +// +stateify savable type resolveMountPointError struct{} // Error implements error.Error. @@ -101,6 +106,7 @@ func (resolveMountPointError) Error() string { return "resolving mount point" } +// +stateify savable type resolveAbsSymlinkError struct{} // Error implements error.Error. @@ -136,31 +142,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat return rp } -func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { +func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) { rp.root = VirtualDentry{} - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil - rp.releaseErrorState() + rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } -func (rp *ResolvingPath) decRefStartAndMount() { +func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.DecRef() + rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.DecRef() + rp.mount.DecRef(ctx) } } -func (rp *ResolvingPath) releaseErrorState() { +func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { - rp.nextStart.DecRef() + rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.DecRef() + rp.nextMount.DecRef(ctx) rp.nextMount = nil } } @@ -236,13 +242,13 @@ func (rp *ResolvingPath) Advance() { // Restart resets the stream of path components represented by rp to its state // on entry to the current FilesystemImpl method. -func (rp *ResolvingPath) Restart() { +func (rp *ResolvingPath) Restart(ctx context.Context) { rp.pit = rp.origParts[rp.numOrigParts-1] rp.mustBeDir = rp.mustBeDirOrig rp.symlinks = rp.symlinksOrig rp.curPart = rp.numOrigParts - 1 copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) - rp.releaseErrorState() + rp.releaseErrorState(ctx) } func (rp *ResolvingPath) relpathCommit() { @@ -260,13 +266,13 @@ func (rp *ResolvingPath) relpathCommit() { // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). -func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { +func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... - vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... of non-root mount. rp.nextMount = vd.mount @@ -283,11 +289,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { // to d. If d is a mount point, such that path resolution should switch to // another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount // returns nil. -func (rp *ResolvingPath) CheckMount(d *Dentry) error { +func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error { if !d.isMounted() { return nil } - if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil { rp.nextMount = mnt return resolveMountPointError{} } @@ -389,11 +395,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { return resolveMountRootOrJumpError{} } -func (rp *ResolvingPath) handleError(err error) bool { +func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -412,7 +418,7 @@ func (rp *ResolvingPath) handleError(err error) bool { case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef @@ -423,12 +429,12 @@ func (rp *ResolvingPath) handleError(err error) bool { // path. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -440,7 +446,7 @@ func (rp *ResolvingPath) handleError(err error) bool { // path, including the symlink target we just prepended. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true default: diff --git a/pkg/sentry/vfs/save_restore.go b/pkg/sentry/vfs/save_restore.go new file mode 100644 index 000000000..46e50d55d --- /dev/null +++ b/pkg/sentry/vfs/save_restore.go @@ -0,0 +1,124 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refsvfs2" +) + +// FilesystemImplSaveRestoreExtension is an optional extension to +// FilesystemImpl. +type FilesystemImplSaveRestoreExtension interface { + // PrepareSave prepares this filesystem for serialization. + PrepareSave(ctx context.Context) error + + // CompleteRestore completes restoration from checkpoint for this + // filesystem after deserialization. + CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error +} + +// PrepareSave prepares all filesystems for serialization. +func (vfs *VirtualFilesystem) PrepareSave(ctx context.Context) error { + failures := 0 + for fs := range vfs.getFilesystems() { + if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok { + if err := ext.PrepareSave(ctx); err != nil { + ctx.Warningf("%T.PrepareSave failed: %v", fs.impl, err) + failures++ + } + } + fs.DecRef(ctx) + } + if failures != 0 { + return fmt.Errorf("%d filesystems failed to prepare for serialization", failures) + } + return nil +} + +// CompleteRestore completes restoration from checkpoint for all filesystems +// after deserialization. +func (vfs *VirtualFilesystem) CompleteRestore(ctx context.Context, opts *CompleteRestoreOptions) error { + failures := 0 + for fs := range vfs.getFilesystems() { + if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok { + if err := ext.CompleteRestore(ctx, *opts); err != nil { + ctx.Warningf("%T.CompleteRestore failed: %v", fs.impl, err) + failures++ + } + } + fs.DecRef(ctx) + } + if failures != 0 { + return fmt.Errorf("%d filesystems failed to complete restore after deserialization", failures) + } + return nil +} + +// CompleteRestoreOptions contains options to +// VirtualFilesystem.CompleteRestore() and +// FilesystemImplSaveRestoreExtension.CompleteRestore(). +type CompleteRestoreOptions struct { + // If ValidateFileSizes is true, filesystem implementations backed by + // remote filesystems should verify that file sizes have not changed + // between checkpoint and restore. + ValidateFileSizes bool + + // If ValidateFileModificationTimestamps is true, filesystem + // implementations backed by remote filesystems should validate that file + // mtimes have not changed between checkpoint and restore. + ValidateFileModificationTimestamps bool +} + +// saveMounts is called by stateify. +func (vfs *VirtualFilesystem) saveMounts() []*Mount { + if atomic.LoadPointer(&vfs.mounts.slots) == nil { + // vfs.Init() was never called. + return nil + } + var mounts []*Mount + vfs.mounts.Range(func(mount *Mount) bool { + mounts = append(mounts, mount) + return true + }) + return mounts +} + +// loadMounts is called by stateify. +func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) { + if mounts == nil { + return + } + vfs.mounts.Init() + for _, mount := range mounts { + vfs.mounts.Insert(mount) + } +} + +func (mnt *Mount) afterLoad() { + if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&mnt.refs) != 0 { + refsvfs2.Register(mnt, "vfs.Mount") + } +} + +// afterLoad is called by stateify. +func (epi *epollInterest) afterLoad() { + // Mark all epollInterests as ready after restore so that the next call to + // EpollInstance.ReadEvents() rechecks their readiness. + epi.Callback(nil) +} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 52643a7c5..48d6252f7 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -24,6 +24,9 @@ // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // EpollInstance.mu +// Inotify.mu +// Watches.mu +// Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -33,6 +36,7 @@ package vfs import ( "fmt" + "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -67,7 +71,7 @@ type VirtualFilesystem struct { // points. // // mounts is analogous to Linux's mount_hashtable. - mounts mountTable + mounts mountTable `state:".([]*Mount)"` // mountpoints maps mount points to mounts at those points in all // namespaces. mountpoints is protected by mountMu. @@ -119,7 +123,10 @@ type VirtualFilesystem struct { } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. -func (vfs *VirtualFilesystem) Init() error { +func (vfs *VirtualFilesystem) Init(ctx context.Context) error { + if vfs.mountpoints != nil { + panic("VFS already initialized") + } vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) vfs.devices = make(map[devTuple]*registeredDevice) vfs.anonBlockDevMinorNext = 1 @@ -139,7 +146,7 @@ func (vfs *VirtualFilesystem) Init() error { devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) - defer anonfs.vfsfs.DecRef() + defer anonfs.vfsfs.DecRef(ctx) anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { // We should not be passing any MountOptions that would cause @@ -151,11 +158,23 @@ func (vfs *VirtualFilesystem) Init() error { return nil } +// Release drops references on filesystem objects held by vfs. +// +// Precondition: This must be called after VFS.Init() has succeeded. +func (vfs *VirtualFilesystem) Release(ctx context.Context) { + vfs.anonMount.DecRef(ctx) + for _, fst := range vfs.fsTypes { + fst.fsType.Release(ctx) + } +} + // PathOperation specifies the path operated on by a VFS method. // // PathOperation is passed to VFS methods by pointer to reduce memory copying: // it's somewhat large and should never escape. (Options structs are passed by // pointer to VFS and FileDescription methods for the same reason.) +// +// +stateify savable type PathOperation struct { // Root is the VFS root. References on Root are borrowed from the provider // of the PathOperation. @@ -186,11 +205,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -208,11 +227,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede dentry: d, } rp.mount.IncRef() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return vd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, err } } @@ -230,7 +249,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au } rp.mount.IncRef() name := rp.Component() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return parentVD, name, nil } if checkInvariants { @@ -238,8 +257,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, "", err } } @@ -254,14 +273,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential } if !newpop.Path.Begin.Ok() { - oldVD.DecRef() + oldVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldVD.DecRef() + oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return syserror.EINVAL } @@ -270,8 +289,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { - vfs.putResolvingPath(rp) - oldVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return nil } if checkInvariants { @@ -279,9 +298,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return err } } @@ -290,6 +309,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { return syserror.EEXIST } @@ -307,7 +328,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -315,8 +336,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -326,6 +347,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // error from the syserror package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { return syserror.EEXIST } @@ -340,7 +363,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -348,8 +371,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -402,31 +425,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) if opts.FileExec { - if fd.Mount().flags.NoExec { - fd.DecRef() + if fd.Mount().Flags.NoExec { + fd.DecRef(ctx) return nil, syserror.EACCES } // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { - fd.DecRef() + fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) + fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -438,11 +461,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return target, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -466,19 +489,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti return err } if oldName == "." || oldName == ".." { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) return syserror.EBUSY } if !newpop.Path.Begin.Ok() { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return syserror.EINVAL } @@ -491,8 +514,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return nil } if checkInvariants { @@ -500,9 +523,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return err } } @@ -511,6 +534,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { return syserror.EBUSY } @@ -525,7 +550,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -533,8 +558,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -546,11 +571,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -562,11 +587,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return stat, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statx{}, err } } @@ -579,11 +604,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return statfs, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statfs{}, err } } @@ -592,6 +617,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with symlinkat(oldpath, newdirfd, ""). if pop.Path.Absolute { return syserror.EEXIST } @@ -606,7 +633,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -614,8 +641,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -624,6 +651,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { return syserror.EBUSY } @@ -638,7 +667,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -646,8 +675,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -655,17 +684,11 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // BoundEndpointAt gets the bound endpoint at the given path, if one exists. func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { - if !pop.Path.Begin.Ok() { - if pop.Path.Absolute { - return nil, syserror.ECONNREFUSED - } - return nil, syserror.ENOENT - } rp := vfs.getResolvingPath(creds, pop) for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return bep, nil } if checkInvariants { @@ -673,21 +696,21 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } } -// ListxattrAt returns all extended attribute names for the file at the given +// ListXattrAt returns all extended attribute names for the file at the given // path. -func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { +func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { - names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) + names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return names, nil } if err == syserror.ENOTSUP { @@ -695,61 +718,61 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } } -// GetxattrAt returns the value associated with the given extended attribute +// GetXattrAt returns the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) { +func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { - val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return val, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } } -// SetxattrAt changes the value associated with the given extended attribute +// SetXattrAt changes the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { +func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { rp := vfs.getResolvingPath(creds, pop) for { - err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } } -// RemovexattrAt removes the given extended attribute from the file at rp. -func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { +// RemoveXattrAt removes the given extended attribute from the file at rp. +func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { rp := vfs.getResolvingPath(creds, pop) for { - err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -757,23 +780,83 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre // SyncAllFilesystems has the semantics of Linux's sync(2). func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { + var retErr error + for fs := range vfs.getFilesystems() { + if err := fs.impl.Sync(ctx); err != nil && retErr == nil { + retErr = err + } + fs.DecRef(ctx) + } + return retErr +} + +func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { fss := make(map[*Filesystem]struct{}) vfs.filesystemsMu.Lock() + defer vfs.filesystemsMu.Unlock() for fs := range vfs.filesystems { if !fs.TryIncRef() { continue } fss[fs] = struct{}{} } - vfs.filesystemsMu.Unlock() - var retErr error - for fs := range fss { - if err := fs.impl.Sync(ctx); err != nil && retErr == nil { - retErr = err - } - fs.DecRef() + return fss +} + +// MkdirAllAt recursively creates non-existent directories on the given path +// (including the last component). +func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error { + pop := &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) + switch err { + case nil: + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory { + return syserror.ENOTDIR + } + // Directory already exists. + return nil + case syserror.ENOENT: + // Expected, we will create the dir. + default: + return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) + } + + // Recurse to ensure parent is created and then create the final directory. + if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil { + return err } - return retErr + if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { + return fmt.Errorf("failed to create directory %q: %w", currentPath, err) + } + return nil +} + +// MakeSyntheticMountpoint creates parent directories of target if they do not +// exist and attempts to create a directory for the mountpoint. If a +// non-directory file already exists there then we allow it. +func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { + mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + + // Make sure the parent directory of target exists. + if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { + return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) + } + + // Attempt to mkdir the final component. If a file (of any type) exists + // then we let allow mounting on top of that because we do not require the + // target to be an existing directory, unlike Linux mount(2). + if err := vfs.MkdirAt(ctx, creds, &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(target), + }, mkdirOpts); err != nil && err != syserror.EEXIST { + return fmt.Errorf("failed to create mountpoint %q: %w", target, err) + } + return nil } // A VirtualDentry represents a node in a VFS tree, by combining a Dentry @@ -825,9 +908,9 @@ func (vd VirtualDentry) IncRef() { // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. -func (vd VirtualDentry) DecRef() { - vd.dentry.DecRef() - vd.mount.DecRef() +func (vd VirtualDentry) DecRef(ctx context.Context) { + vd.dentry.DecRef(ctx) + vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. It does not take a reference on diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 101497ed6..bbafb8b7f 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -77,7 +77,10 @@ var DefaultOpts = Opts{ // trigger it. const descheduleThreshold = 1 * time.Second -var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") +var ( + stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout") + stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") +) // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. var stackDumpSameTaskPeriod = time.Minute @@ -93,15 +96,33 @@ const ( Panic ) +// Set implements flag.Value. +func (a *Action) Set(v string) error { + switch v { + case "log", "logwarning": + *a = LogWarning + case "panic": + *a = Panic + default: + return fmt.Errorf("invalid watchdog action %q", v) + } + return nil +} + +// Get implements flag.Value. +func (a *Action) Get() interface{} { + return *a +} + // String returns Action's string representation. -func (a Action) String() string { - switch a { +func (a *Action) String() string { + switch *a { case LogWarning: - return "LogWarning" + return "logWarning" case Panic: - return "Panic" + return "panic" default: - panic(fmt.Sprintf("Invalid action: %d", a)) + panic(fmt.Sprintf("Invalid watchdog action: %d", *a)) } } @@ -220,6 +241,9 @@ func (w *Watchdog) waitForStart() { // We are fine. return } + + stuckStartup.Increment() + var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) w.doAction(w.StartupTimeoutAction, false, &buf) @@ -323,13 +347,13 @@ func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound boo func (w *Watchdog) reportStuckWatchdog() { var buf bytes.Buffer - buf.WriteString("Watchdog goroutine is stuck:") + buf.WriteString("Watchdog goroutine is stuck") w.doAction(w.TaskTimeoutAction, false, &buf) } // doAction will take the given action. If the action is LogWarning, the stack -// is not always dumpped to the log to prevent log flooding. "forceStack" -// guarantees that the stack will be dumped regarless. +// is not always dumped to the log to prevent log flooding. "forceStack" +// guarantees that the stack will be dumped regardless. func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) { switch action { case LogWarning: |