diff options
Diffstat (limited to 'pkg/sentry')
132 files changed, 1251 insertions, 1383 deletions
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index c9c52530d..61dacd2fb 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -14,12 +14,8 @@ go_library( "arch_x86.go", "arch_x86_impl.go", "auxv.go", - "signal.go", - "signal_act.go", "signal_amd64.go", "signal_arm64.go", - "signal_info.go", - "signal_stack.go", "stack.go", "stack_unsafe.go", "syscalls_amd64.go", diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 290863ee6..c9393b091 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -134,21 +134,13 @@ type Context interface { // RegisterMap returns a map of all registers. RegisterMap() (map[string]uintptr, error) - // NewSignalAct returns a new object that is equivalent to struct sigaction - // in the guest architecture. - NewSignalAct() NativeSignalAct - - // NewSignalStack returns a new object that is equivalent to stack_t in the - // guest architecture. - NewSignalStack() NativeSignalStack - // SignalSetup modifies the context in preparation for handling the // given signal. // // st is the stack where the signal handler frame should be // constructed. // - // act is the SignalAct that specifies how this signal is being + // act is the SigAction that specifies how this signal is being // handled. // // info is the SignalInfo of the signal being delivered. @@ -157,7 +149,7 @@ type Context interface { // stack is not going to be used). // // sigset is the signal mask before entering the signal handler. - SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error + SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet) error // SignalRestore restores context after returning from a signal // handler. @@ -167,7 +159,7 @@ type Context interface { // rt is true if SignalRestore is being entered from rt_sigreturn and // false if SignalRestore is being entered from sigreturn. // SignalRestore returns the thread's new signal mask. - SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) + SignalRestore(st *Stack, rt bool) (linux.SignalSet, linux.SignalStack, error) // CPUIDEmulate emulates a CPUID instruction according to current register state. CPUIDEmulate(l log.Logger) diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go deleted file mode 100644 index 67d7edf68..000000000 --- a/pkg/sentry/arch/signal.go +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package arch - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/hostarch" -) - -// SignalAct represents the action that should be taken when a signal is -// delivered, and is equivalent to struct sigaction. -// -// +marshal -// +stateify savable -type SignalAct struct { - Handler uint64 - Flags uint64 - Restorer uint64 // Only used on amd64. - Mask linux.SignalSet -} - -// SerializeFrom implements NativeSignalAct.SerializeFrom. -func (s *SignalAct) SerializeFrom(other *SignalAct) { - *s = *other -} - -// DeserializeTo implements NativeSignalAct.DeserializeTo. -func (s *SignalAct) DeserializeTo(other *SignalAct) { - *other = *s -} - -// SignalStack represents information about a user stack, and is equivalent to -// stack_t. -// -// +marshal -// +stateify savable -type SignalStack struct { - Addr uint64 - Flags uint32 - _ uint32 - Size uint64 -} - -// SerializeFrom implements NativeSignalStack.SerializeFrom. -func (s *SignalStack) SerializeFrom(other *SignalStack) { - *s = *other -} - -// DeserializeTo implements NativeSignalStack.DeserializeTo. -func (s *SignalStack) DeserializeTo(other *SignalStack) { - *other = *s -} - -// SignalInfo represents information about a signal being delivered, and is -// equivalent to struct siginfo in linux kernel(linux/include/uapi/asm-generic/siginfo.h). -// -// +marshal -// +stateify savable -type SignalInfo struct { - Signo int32 // Signal number - Errno int32 // Errno value - Code int32 // Signal code - _ uint32 - - // struct siginfo::_sifields is a union. In SignalInfo, fields in the union - // are accessed through methods. - // - // For reference, here is the definition of _sifields: (_sigfault._trapno, - // which does not exist on x86, omitted for clarity) - // - // union { - // int _pad[SI_PAD_SIZE]; - // - // /* kill() */ - // struct { - // __kernel_pid_t _pid; /* sender's pid */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // } _kill; - // - // /* POSIX.1b timers */ - // struct { - // __kernel_timer_t _tid; /* timer id */ - // int _overrun; /* overrun count */ - // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; - // sigval_t _sigval; /* same as below */ - // int _sys_private; /* not to be passed to user */ - // } _timer; - // - // /* POSIX.1b signals */ - // struct { - // __kernel_pid_t _pid; /* sender's pid */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // sigval_t _sigval; - // } _rt; - // - // /* SIGCHLD */ - // struct { - // __kernel_pid_t _pid; /* which child */ - // __ARCH_SI_UID_T _uid; /* sender's uid */ - // int _status; /* exit code */ - // __ARCH_SI_CLOCK_T _utime; - // __ARCH_SI_CLOCK_T _stime; - // } _sigchld; - // - // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ - // struct { - // void *_addr; /* faulting insn/memory ref. */ - // short _addr_lsb; /* LSB of the reported address */ - // } _sigfault; - // - // /* SIGPOLL */ - // struct { - // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ - // int _fd; - // } _sigpoll; - // - // /* SIGSYS */ - // struct { - // void *_call_addr; /* calling user insn */ - // int _syscall; /* triggering system call number */ - // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ - // } _sigsys; - // } _sifields; - // - // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 - // bytes. - Fields [128 - 16]byte -} - -// FixSignalCodeForUser fixes up si_code. -// -// The si_code we get from Linux may contain the kernel-specific code in the -// top 16 bits if it's positive (e.g., from ptrace). Linux's -// copy_siginfo_to_user does -// err |= __put_user((short)from->si_code, &to->si_code); -// to mask out those bits and we need to do the same. -func (s *SignalInfo) FixSignalCodeForUser() { - if s.Code > 0 { - s.Code &= 0x0000ffff - } -} - -// PID returns the si_pid field. -func (s *SignalInfo) PID() int32 { - return int32(hostarch.ByteOrder.Uint32(s.Fields[0:4])) -} - -// SetPID mutates the si_pid field. -func (s *SignalInfo) SetPID(val int32) { - hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) -} - -// UID returns the si_uid field. -func (s *SignalInfo) UID() int32 { - return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) -} - -// SetUID mutates the si_uid field. -func (s *SignalInfo) SetUID(val int32) { - hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) -} - -// Sigval returns the sigval field, which is aliased to both si_int and si_ptr. -func (s *SignalInfo) Sigval() uint64 { - return hostarch.ByteOrder.Uint64(s.Fields[8:16]) -} - -// SetSigval mutates the sigval field. -func (s *SignalInfo) SetSigval(val uint64) { - hostarch.ByteOrder.PutUint64(s.Fields[8:16], val) -} - -// TimerID returns the si_timerid field. -func (s *SignalInfo) TimerID() linux.TimerID { - return linux.TimerID(hostarch.ByteOrder.Uint32(s.Fields[0:4])) -} - -// SetTimerID sets the si_timerid field. -func (s *SignalInfo) SetTimerID(val linux.TimerID) { - hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) -} - -// Overrun returns the si_overrun field. -func (s *SignalInfo) Overrun() int32 { - return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) -} - -// SetOverrun sets the si_overrun field. -func (s *SignalInfo) SetOverrun(val int32) { - hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) -} - -// Addr returns the si_addr field. -func (s *SignalInfo) Addr() uint64 { - return hostarch.ByteOrder.Uint64(s.Fields[0:8]) -} - -// SetAddr sets the si_addr field. -func (s *SignalInfo) SetAddr(val uint64) { - hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) -} - -// Status returns the si_status field. -func (s *SignalInfo) Status() int32 { - return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) -} - -// SetStatus mutates the si_status field. -func (s *SignalInfo) SetStatus(val int32) { - hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) -} - -// CallAddr returns the si_call_addr field. -func (s *SignalInfo) CallAddr() uint64 { - return hostarch.ByteOrder.Uint64(s.Fields[0:8]) -} - -// SetCallAddr mutates the si_call_addr field. -func (s *SignalInfo) SetCallAddr(val uint64) { - hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) -} - -// Syscall returns the si_syscall field. -func (s *SignalInfo) Syscall() int32 { - return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) -} - -// SetSyscall mutates the si_syscall field. -func (s *SignalInfo) SetSyscall(val int32) { - hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) -} - -// Arch returns the si_arch field. -func (s *SignalInfo) Arch() uint32 { - return hostarch.ByteOrder.Uint32(s.Fields[12:16]) -} - -// SetArch mutates the si_arch field. -func (s *SignalInfo) SetArch(val uint32) { - hostarch.ByteOrder.PutUint32(s.Fields[12:16], val) -} - -// Band returns the si_band field. -func (s *SignalInfo) Band() int64 { - return int64(hostarch.ByteOrder.Uint64(s.Fields[0:8])) -} - -// SetBand mutates the si_band field. -func (s *SignalInfo) SetBand(val int64) { - // Note: this assumes the platform uses `long` as `__ARCH_SI_BAND_T`. - // On some platforms, which gVisor doesn't support, `__ARCH_SI_BAND_T` is - // `int`. See siginfo.h. - hostarch.ByteOrder.PutUint64(s.Fields[0:8], uint64(val)) -} - -// FD returns the si_fd field. -func (s *SignalInfo) FD() uint32 { - return hostarch.ByteOrder.Uint32(s.Fields[8:12]) -} - -// SetFD mutates the si_fd field. -func (s *SignalInfo) SetFD(val uint32) { - hostarch.ByteOrder.PutUint32(s.Fields[8:12], val) -} diff --git a/pkg/sentry/arch/signal_act.go b/pkg/sentry/arch/signal_act.go deleted file mode 100644 index d3e2324a8..000000000 --- a/pkg/sentry/arch/signal_act.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package arch - -import "gvisor.dev/gvisor/pkg/marshal" - -// Special values for SignalAct.Handler. -const ( - // SignalActDefault is SIG_DFL and specifies that the default behavior for - // a signal should be taken. - SignalActDefault = 0 - - // SignalActIgnore is SIG_IGN and specifies that a signal should be - // ignored. - SignalActIgnore = 1 -) - -// Available signal flags. -const ( - SignalFlagNoCldStop = 0x00000001 - SignalFlagNoCldWait = 0x00000002 - SignalFlagSigInfo = 0x00000004 - SignalFlagRestorer = 0x04000000 - SignalFlagOnStack = 0x08000000 - SignalFlagRestart = 0x10000000 - SignalFlagInterrupt = 0x20000000 - SignalFlagNoDefer = 0x40000000 - SignalFlagResetHandler = 0x80000000 -) - -// IsSigInfo returns true iff this handle expects siginfo. -func (s SignalAct) IsSigInfo() bool { - return s.Flags&SignalFlagSigInfo != 0 -} - -// IsNoDefer returns true iff this SignalAct has the NoDefer flag set. -func (s SignalAct) IsNoDefer() bool { - return s.Flags&SignalFlagNoDefer != 0 -} - -// IsRestart returns true iff this SignalAct has the Restart flag set. -func (s SignalAct) IsRestart() bool { - return s.Flags&SignalFlagRestart != 0 -} - -// IsResetHandler returns true iff this SignalAct has the ResetHandler flag set. -func (s SignalAct) IsResetHandler() bool { - return s.Flags&SignalFlagResetHandler != 0 -} - -// IsOnStack returns true iff this SignalAct has the OnStack flag set. -func (s SignalAct) IsOnStack() bool { - return s.Flags&SignalFlagOnStack != 0 -} - -// HasRestorer returns true iff this SignalAct has the Restorer flag set. -func (s SignalAct) HasRestorer() bool { - return s.Flags&SignalFlagRestorer != 0 -} - -// NativeSignalAct is a type that is equivalent to struct sigaction in the -// guest architecture. -type NativeSignalAct interface { - marshal.Marshallable - - // SerializeFrom copies the data in the host SignalAct s into this object. - SerializeFrom(s *SignalAct) - - // DeserializeTo copies the data in this object into the host SignalAct s. - DeserializeTo(s *SignalAct) -} diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 082ed92b1..58e28dbba 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -76,21 +76,11 @@ const ( type UContext64 struct { Flags uint64 Link uint64 - Stack SignalStack + Stack linux.SignalStack MContext SignalContext64 Sigset linux.SignalSet } -// NewSignalAct implements Context.NewSignalAct. -func (c *context64) NewSignalAct() NativeSignalAct { - return &SignalAct{} -} - -// NewSignalStack implements Context.NewSignalStack. -func (c *context64) NewSignalStack() NativeSignalStack { - return &SignalStack{} -} - // From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the // size of the magic cookie at the end of the xsave frame. // @@ -110,7 +100,7 @@ func (c *context64) fpuFrameSize() (size int, useXsave bool) { // SignalSetup implements Context.SignalSetup. (Compare to Linux's // arch/x86/kernel/signal.c:__setup_rt_frame().) -func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error { +func (c *context64) SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet) error { sp := st.Bottom // "The 128-byte area beyond the location pointed to by %rsp is considered @@ -187,7 +177,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // Prior to proceeding, figure out if the frame will exhaust the range // for the signal stack. This is not allowed, and should immediately // force signal delivery (reverting to the default handler). - if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) { + if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() && !alt.Contains(frameBottom) { return unix.EFAULT } @@ -203,7 +193,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt return err } ucAddr := st.Bottom - if act.HasRestorer() { + if act.Flags&linux.SA_RESTORER != 0 { // Push the restorer return address. // Note that this doesn't need to be popped. if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil { @@ -237,15 +227,15 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // SignalRestore implements Context.SignalRestore. (Compare to Linux's // arch/x86/kernel/signal.c:sys_rt_sigreturn().) -func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, linux.SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { - return 0, SignalStack{}, err + return 0, linux.SignalStack{}, err } - var info SignalInfo + var info linux.SignalInfo if _, err := info.CopyIn(st, StackBottomMagic); err != nil { - return 0, SignalStack{}, err + return 0, linux.SignalStack{}, err } // Restore registers. diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index da71fb873..80df90076 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -61,7 +61,7 @@ type FpsimdContext struct { type UContext64 struct { Flags uint64 Link uint64 - Stack SignalStack + Stack linux.SignalStack Sigset linux.SignalSet // glibc uses a 1024-bit sigset_t _pad [120]byte // (1024 - 64) / 8 = 120 @@ -71,18 +71,8 @@ type UContext64 struct { MContext SignalContext64 } -// NewSignalAct implements Context.NewSignalAct. -func (c *context64) NewSignalAct() NativeSignalAct { - return &SignalAct{} -} - -// NewSignalStack implements Context.NewSignalStack. -func (c *context64) NewSignalStack() NativeSignalStack { - return &SignalStack{} -} - // SignalSetup implements Context.SignalSetup. -func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error { +func (c *context64) SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet) error { sp := st.Bottom // Construct the UContext64 now since we need its size. @@ -114,7 +104,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt // Prior to proceeding, figure out if the frame will exhaust the range // for the signal stack. This is not allowed, and should immediately // force signal delivery (reverting to the default handler). - if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) { + if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() && !alt.Contains(frameBottom) { return unix.EFAULT } @@ -137,7 +127,7 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt c.Regs.Regs[0] = uint64(info.Signo) c.Regs.Regs[1] = uint64(infoAddr) c.Regs.Regs[2] = uint64(ucAddr) - c.Regs.Regs[30] = uint64(act.Restorer) + c.Regs.Regs[30] = act.Restorer // Save the thread's floating point state. c.sigFPState = append(c.sigFPState, c.fpState) @@ -147,15 +137,15 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt } // SignalRestore implements Context.SignalRestore. -func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) { +func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, linux.SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { - return 0, SignalStack{}, err + return 0, linux.SignalStack{}, err } - var info SignalInfo + var info linux.SignalInfo if _, err := info.CopyIn(st, StackBottomMagic); err != nil { - return 0, SignalStack{}, err + return 0, linux.SignalStack{}, err } // Restore registers. diff --git a/pkg/sentry/arch/signal_info.go b/pkg/sentry/arch/signal_info.go deleted file mode 100644 index f93ee8b46..000000000 --- a/pkg/sentry/arch/signal_info.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package arch - -// Possible values for SignalInfo.Code. These values originate from the Linux -// kernel's include/uapi/asm-generic/siginfo.h. -const ( - // SignalInfoUser (properly SI_USER) indicates that a signal was sent from - // a kill() or raise() syscall. - SignalInfoUser = 0 - - // SignalInfoKernel (properly SI_KERNEL) indicates that the signal was sent - // by the kernel. - SignalInfoKernel = 0x80 - - // SignalInfoTimer (properly SI_TIMER) indicates that the signal was sent - // by an expired timer. - SignalInfoTimer = -2 - - // SignalInfoTkill (properly SI_TKILL) indicates that the signal was sent - // from a tkill() or tgkill() syscall. - SignalInfoTkill = -6 - - // CLD_* codes are only meaningful for SIGCHLD. - - // CLD_EXITED indicates that a task exited. - CLD_EXITED = 1 - - // CLD_KILLED indicates that a task was killed by a signal. - CLD_KILLED = 2 - - // CLD_DUMPED indicates that a task was killed by a signal and then dumped - // core. - CLD_DUMPED = 3 - - // CLD_TRAPPED indicates that a task was stopped by ptrace. - CLD_TRAPPED = 4 - - // CLD_STOPPED indicates that a thread group completed a group stop. - CLD_STOPPED = 5 - - // CLD_CONTINUED indicates that a group-stopped thread group was continued. - CLD_CONTINUED = 6 - - // SYS_* codes are only meaningful for SIGSYS. - - // SYS_SECCOMP indicates that a signal originates from seccomp. - SYS_SECCOMP = 1 - - // TRAP_* codes are only meaningful for SIGTRAP. - - // TRAP_BRKPT indicates a breakpoint trap. - TRAP_BRKPT = 1 -) diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go deleted file mode 100644 index c732c7503..000000000 --- a/pkg/sentry/arch/signal_stack.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build 386 amd64 arm64 - -package arch - -import ( - "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/marshal" -) - -const ( - // SignalStackFlagOnStack is possible set on return from getaltstack, - // in order to indicate that the thread is currently on the alt stack. - SignalStackFlagOnStack = 1 - - // SignalStackFlagDisable is a flag to indicate the stack is disabled. - SignalStackFlagDisable = 2 -) - -// IsEnabled returns true iff this signal stack is marked as enabled. -func (s SignalStack) IsEnabled() bool { - return s.Flags&SignalStackFlagDisable == 0 -} - -// Top returns the stack's top address. -func (s SignalStack) Top() hostarch.Addr { - return hostarch.Addr(s.Addr + s.Size) -} - -// SetOnStack marks this signal stack as in use. -// -// Note that there is no corresponding ClearOnStack, and that this should only -// be called on copies that are serialized to userspace. -func (s *SignalStack) SetOnStack() { - s.Flags |= SignalStackFlagOnStack -} - -// Contains checks if the stack pointer is within this stack. -func (s *SignalStack) Contains(sp hostarch.Addr) bool { - return hostarch.Addr(s.Addr) < sp && sp <= hostarch.Addr(s.Addr+s.Size) -} - -// NativeSignalStack is a type that is equivalent to stack_t in the guest -// architecture. -type NativeSignalStack interface { - marshal.Marshallable - - // SerializeFrom copies the data in the host SignalStack s into this - // object. - SerializeFrom(s *SignalStack) - - // DeserializeTo copies the data in this object into the host SignalStack - // s. - DeserializeTo(s *SignalStack) -} diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 65a794c7c..85e3515af 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -45,7 +45,7 @@ type Stack struct { } // scratchBufLen is the default length of Stack.scratchBuf. The -// largest structs the stack regularly serializes are arch.SignalInfo +// largest structs the stack regularly serializes are linux.SignalInfo // and arch.UContext64. We'll set the default size as the larger of // the two, arch.UContext64. var scratchBufLen = (*UContext64)(nil).SizeBytes() diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 367849e75..221e98a01 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -99,6 +99,9 @@ type ExecArgs struct { // PIDNamespace is the pid namespace for the process being executed. PIDNamespace *kernel.PIDNamespace + + // Limits is the limit set for the process being executed. + Limits *limits.LimitSet } // String prints the arguments as a string. @@ -151,6 +154,10 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI if pidns == nil { pidns = proc.Kernel.RootPIDNamespace() } + limitSet := args.Limits + if limitSet == nil { + limitSet = limits.NewLimitSet() + } initArgs := kernel.CreateProcessArgs{ Filename: args.Filename, Argv: args.Argv, @@ -161,7 +168,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI Credentials: creds, FDTable: fdTable, Umask: 0022, - Limits: limits.NewLimitSet(), + Limits: limitSet, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index b90f7c1be..4c99944e7 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -478,6 +478,20 @@ func (f FilePermissions) AnyRead() bool { return f.User.Read || f.Group.Read || f.Other.Read } +// HasSetUIDOrGID returns true if either the setuid or setgid bit is set. +func (f FilePermissions) HasSetUIDOrGID() bool { + return f.SetUID || f.SetGID +} + +// DropSetUIDAndMaybeGID turns off setuid, and turns off setgid if f allows +// group execution. +func (f *FilePermissions) DropSetUIDAndMaybeGID() { + f.SetUID = false + if f.Group.Execute { + f.SetGID = false + } +} + // FileOwner represents ownership of a file. // // +stateify savable diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index e84ba7a5d..c62effd52 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -16,6 +16,7 @@ package dev import ( + "fmt" "math" "gvisor.dev/gvisor/pkg/context" @@ -90,6 +91,11 @@ func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.In // New returns the root node of a device filesystem. func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + shm, err := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, nil /* parent */) + if err != nil { + panic(fmt.Sprintf("tmpfs.NewDir failed: %v", err)) + } + contents := map[string]*fs.Inode{ "fd": newSymlink(ctx, "/proc/self/fd", msrc), "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), @@ -108,7 +114,7 @@ func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { "random": newMemDevice(ctx, newRandomDevice(ctx, fs.RootOwner, 0444), msrc, randomDevMinor), "urandom": newMemDevice(ctx, newRandomDevice(ctx, fs.RootOwner, 0444), msrc, urandomDevMinor), - "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc), + "shm": shm, // A devpts is typically mounted at /dev/pts to provide // pseudoterminal support. Place an empty directory there for diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index e1e38b498..8ac3738e9 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -155,12 +155,20 @@ func (h *HostMappable) DecRef(fr memmap.FileRange) { // T2: Appends to file causing it to grow // T2: Writes to mapped pages and COW happens // T1: Continues and wronly invalidates the page mapped in step above. -func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { +func (h *HostMappable) Truncate(ctx context.Context, newSize int64, uattr fs.UnstableAttr) error { h.truncateMu.Lock() defer h.truncateMu.Unlock() mask := fs.AttrMask{Size: true} attr := fs.UnstableAttr{Size: newSize} + + // Truncating a file clears privilege bits. + if uattr.Perms.HasSetUIDOrGID() { + mask.Perms = true + attr.Perms = uattr.Perms + attr.Perms.DropSetUIDAndMaybeGID() + } + if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr, false); err != nil { return err } @@ -193,10 +201,17 @@ func (h *HostMappable) Allocate(ctx context.Context, offset int64, length int64) } // Write writes to the file backing this mappable. -func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { +func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64, uattr fs.UnstableAttr) (int64, error) { h.truncateMu.RLock() + defer h.truncateMu.RUnlock() n, err := src.CopyInTo(ctx, &writer{ctx: ctx, hostMappable: h, off: offset}) - h.truncateMu.RUnlock() + if n > 0 && uattr.Perms.HasSetUIDOrGID() { + mask := fs.AttrMask{Perms: true} + uattr.Perms.DropSetUIDAndMaybeGID() + if err := h.backingFile.SetMaskedAttributes(ctx, mask, uattr, false); err != nil { + return n, err + } + } return n, err } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 7856b354b..855029b84 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -310,6 +310,12 @@ func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, now := ktime.NowFromContext(ctx) masked := fs.AttrMask{Size: true} attr := fs.UnstableAttr{Size: size} + if c.attr.Perms.HasSetUIDOrGID() { + masked.Perms = true + attr.Perms = c.attr.Perms + attr.Perms.DropSetUIDAndMaybeGID() + c.attr.Perms = attr.Perms + } if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil { c.dataMu.Unlock() return err @@ -685,13 +691,14 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { return done, nil } -// maybeGrowFile grows the file's size if data has been written past the old -// size. +// maybeUpdateAttrs updates the file's attributes after a write. It updates +// size if data has been written past the old size, and setuid/setgid if any +// bytes were written. // // Preconditions: // * rw.c.attrMu must be locked. // * rw.c.dataMu must be locked. -func (rw *inodeReadWriter) maybeGrowFile() { +func (rw *inodeReadWriter) maybeUpdateAttrs(nwritten uint64) { // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.offset > rw.c.attr.Size { @@ -705,6 +712,12 @@ func (rw *inodeReadWriter) maybeGrowFile() { rw.c.attr.Usage = rw.offset rw.c.dirtyAttr.Usage = true } + + // If bytes were written, ensure setuid and setgid are cleared. + if nwritten > 0 && rw.c.attr.Perms.HasSetUIDOrGID() { + rw.c.dirtyAttr.Perms = true + rw.c.attr.Perms.DropSetUIDAndMaybeGID() + } } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. @@ -732,7 +745,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error segMR := seg.Range().Intersect(mr) ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write) if err != nil { - rw.maybeGrowFile() + rw.maybeUpdateAttrs(done) rw.c.dataMu.Unlock() return done, err } @@ -744,7 +757,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error srcs = srcs.DropFirst64(n) rw.c.dirty.MarkDirty(segMR) if err != nil { - rw.maybeGrowFile() + rw.maybeUpdateAttrs(done) rw.c.dataMu.Unlock() return done, err } @@ -765,7 +778,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != src.NumBytes() || err != nil { - rw.maybeGrowFile() + rw.maybeUpdateAttrs(done) rw.c.dataMu.Unlock() return done, err } @@ -774,7 +787,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error seg, gap = gap.NextSegment(), FileRangeGapIterator{} } } - rw.maybeGrowFile() + rw.maybeUpdateAttrs(done) rw.c.dataMu.Unlock() return done, nil } diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index c4a069832..94cb05246 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/fd", "//pkg/hostarch", "//pkg/log", + "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index 07a564e92..f8b7a60fc 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -139,7 +139,7 @@ func (cp cachePolicy) revalidate(ctx context.Context, name string, parent, child // Walk from parent to child again. // - // TODO(b/112031682): If we have a directory FD in the parent + // NOTE(b/112031682): If we have a directory FD in the parent // inodeOperations, then we can use fstatat(2) to get the inode // attributes instead of making this RPC. qids, f, mask, attr, err := parentIops.fileState.file.walkGetAttr(ctx, []string{name}) diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 8f5a87120..73d80d9b5 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -91,7 +92,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } if flags.Write { if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { - fsmetric.GoferOpensWX.Increment() + metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file") log.Warningf("Opened a writable executable: %q", name) } } @@ -236,10 +237,20 @@ func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.I // and availability of a host-mappable FD. if f.inodeOperations.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { n, err = f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) - } else if f.inodeOperations.fileState.hostMappable != nil { - n, err = f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset) } else { - n, err = src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) + uattr, e := f.UnstableAttr(ctx, file) + if e != nil { + return 0, e + } + if f.inodeOperations.fileState.hostMappable != nil { + n, err = f.inodeOperations.fileState.hostMappable.Write(ctx, src, offset, uattr) + } else { + n, err = src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) + if n > 0 && uattr.Perms.HasSetUIDOrGID() { + uattr.Perms.DropSetUIDAndMaybeGID() + f.inodeOperations.SetPermissions(ctx, file.Dirent.Inode, uattr.Perms) + } + } } if n == 0 { diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index b97635ec4..da3178527 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -600,11 +600,25 @@ func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } + + uattr, err := i.fileState.unstableAttr(ctx) + if err != nil { + return err + } + if i.session().cachePolicy == cacheRemoteRevalidating { - return i.fileState.hostMappable.Truncate(ctx, length) + return i.fileState.hostMappable.Truncate(ctx, length, uattr) + } + + mask := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: uint64(length)} + if uattr.Perms.HasSetUIDOrGID() { + mask.Permissions = true + uattr.Perms.DropSetUIDAndMaybeGID() + attr.Permissions = p9.FileMode(uattr.Perms.LinuxMode()) } - return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) + return i.fileState.file.setAttr(ctx, mask, attr) } // GetXattr implements fs.InodeOperations.GetXattr. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index 6b3627813..940838a44 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -130,7 +130,16 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string panic(fmt.Sprintf("Create called with unknown or unset open flags: %v", flags)) } + // If the parent directory has setgid enabled, change the new file's owner. owner := fs.FileOwnerFromContext(ctx) + parentUattr, err := dir.UnstableAttr(ctx) + if err != nil { + return nil, err + } + if parentUattr.Perms.SetGID { + owner.GID = parentUattr.Owner.GID + } + hostFile, err := newFile.create(ctx, name, openFlags, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) if err != nil { // Could not create the file. @@ -225,7 +234,18 @@ func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s return syserror.ENAMETOOLONG } + // If the parent directory has setgid enabled, change the new directory's + // owner and enable setgid. owner := fs.FileOwnerFromContext(ctx) + parentUattr, err := dir.UnstableAttr(ctx) + if err != nil { + return err + } + if parentUattr.Perms.SetGID { + owner.GID = parentUattr.Owner.GID + perm.SetGID = true + } + if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { return err } diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index b998fb75d..085aa6d61 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -77,6 +77,27 @@ func (*overcommitMemory) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandl }, 0 } +// +stateify savable +type maxMapCount struct{} + +// NeedsUpdate implements seqfile.SeqSource. +func (*maxMapCount) NeedsUpdate(int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource. +func (*maxMapCount) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + return []seqfile.SeqData{ + { + Buf: []byte("2147483647\n"), + Handle: (*maxMapCount)(nil), + }, + }, 0 +} + func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { h := hostname{ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC), @@ -96,6 +117,7 @@ func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { children := map[string]*fs.Inode{ + "max_map_count": seqfile.NewSeqFileInode(ctx, &maxMapCount{}, msrc), "mmap_min_addr": seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc), "overcommit_memory": seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc), } diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 1d09afdd7..4893af56b 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -403,7 +403,7 @@ type ipForwarding struct { // enabled stores the IPv4 forwarding state on save. // We must save/restore this here, since a netstack instance // is created on restore. - enabled *bool + enabled bool } func newIPForwardingInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { @@ -461,13 +461,8 @@ func (f *ipForwardingFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOS return 0, io.EOF } - if f.ipf.enabled == nil { - enabled := f.stack.Forwarding(ipv4.ProtocolNumber) - f.ipf.enabled = &enabled - } - val := "0\n" - if *f.ipf.enabled { + if f.ipf.enabled { // Technically, this is not quite compatible with Linux. Linux // stores these as an integer, so if you write "2" into // ip_forward, you should get 2 back. @@ -494,11 +489,8 @@ func (f *ipForwardingFile) Write(ctx context.Context, _ *fs.File, src usermem.IO if err != nil { return n, err } - if f.ipf.enabled == nil { - f.ipf.enabled = new(bool) - } - *f.ipf.enabled = v != 0 - return n, f.stack.SetForwarding(ipv4.ProtocolNumber, *f.ipf.enabled) + f.ipf.enabled = v != 0 + return n, f.stack.SetForwarding(ipv4.ProtocolNumber, f.ipf.enabled) } // portRangeInode implements fs.InodeOperations. It provides and allows diff --git a/pkg/sentry/fs/proc/sys_net_state.go b/pkg/sentry/fs/proc/sys_net_state.go index 4cb4741af..51d2be647 100644 --- a/pkg/sentry/fs/proc/sys_net_state.go +++ b/pkg/sentry/fs/proc/sys_net_state.go @@ -47,9 +47,7 @@ func (s *tcpSack) afterLoad() { // afterLoad is invoked by stateify. func (ipf *ipForwarding) afterLoad() { - if ipf.enabled != nil { - if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { - panic(fmt.Sprintf("failed to set IPv4 forwarding [%v]: %v", *ipf.enabled, err)) - } + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { + panic(fmt.Sprintf("ipf.stack.SetForwarding(%d, %t): %s", ipv4.ProtocolNumber, ipf.enabled, err)) } } diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index bc117ca6a..b48d475ed 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -151,5 +151,5 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou } // Construct the tmpfs root. - return NewDir(ctx, nil, owner, perms, msrc), nil + return NewDir(ctx, nil, owner, perms, msrc, nil /* parent */) } diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f4de8c968..7faa822f0 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -226,6 +226,12 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in now := ktime.NowFromContext(ctx) f.attr.ModificationTime = now f.attr.StatusChangeTime = now + + // Truncating clears privilege bits. + f.attr.Perms.SetUID = false + if f.attr.Perms.Group.Execute { + f.attr.Perms.SetGID = false + } } f.dataMu.Unlock() @@ -363,7 +369,14 @@ func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, now := ktime.NowFromContext(ctx) f.attr.ModificationTime = now f.attr.StatusChangeTime = now - return src.CopyInTo(ctx, &fileReadWriter{f, offset}) + nwritten, err := src.CopyInTo(ctx, &fileReadWriter{f, offset}) + + // Writing clears privilege bits. + if nwritten > 0 { + f.attr.Perms.DropSetUIDAndMaybeGID() + } + + return nwritten, err } type fileReadWriter struct { diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 577052888..6aa8ff331 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -87,7 +87,20 @@ type Dir struct { var _ fs.InodeOperations = (*Dir)(nil) // NewDir returns a new directory. -func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { +func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, parent *fs.Inode) (*fs.Inode, error) { + // If the parent has setgid enabled, the new directory enables it and changes + // its GID. + if parent != nil { + parentUattr, err := parent.UnstableAttr(ctx) + if err != nil { + return nil, err + } + if parentUattr.Perms.SetGID { + owner.GID = parentUattr.Owner.GID + perms.SetGID = true + } + } + d := &Dir{ ramfsDir: ramfs.NewDir(ctx, contents, owner, perms), kernel: kernel.KernelFromContext(ctx), @@ -101,7 +114,7 @@ func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwn InodeID: tmpfsDevice.NextIno(), BlockSize: hostarch.PageSize, Type: fs.Directory, - }) + }), nil } // afterLoad is invoked by stateify. @@ -219,11 +232,21 @@ func (d *Dir) SetTimestamps(ctx context.Context, i *fs.Inode, ts fs.TimeSpec) er func (d *Dir) newCreateOps() *ramfs.CreateOps { return &ramfs.CreateOps{ NewDir: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { - return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil + return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, dir) }, NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { + // If the parent has setgid enabled, change the GID of the new file. + owner := fs.FileOwnerFromContext(ctx) + parentUattr, err := dir.UnstableAttr(ctx) + if err != nil { + return nil, err + } + if parentUattr.Perms.SetGID { + owner.GID = parentUattr.Owner.GID + } + uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ - Owner: fs.FileOwnerFromContext(ctx), + Owner: owner, Perms: perms, // Always start unlinked. Links: 0, diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go index 12b786224..7f8fa8038 100644 --- a/pkg/sentry/fs/user/user_test.go +++ b/pkg/sentry/fs/user/user_test.go @@ -104,7 +104,10 @@ func TestGetExecUserHome(t *testing.T) { t.Run(name, func(t *testing.T) { ctx := contexttest.Context(t) msrc := fs.NewPseudoMountSource(ctx) - rootInode := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc) + rootInode, err := tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, nil /* parent */) + if err != nil { + t.Fatalf("tmpfs.NewDir failed: %v", err) + } mns, err := fs.NewMountNamespace(ctx, rootInode) if err != nil { diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index 0f54888d8..fe9871bdd 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -68,11 +67,6 @@ func (c *controllerCommon) Enabled() bool { return true } -// Filesystem implements kernel.CgroupController.Filesystem. -func (c *controllerCommon) Filesystem() *vfs.Filesystem { - return c.fs.VFSFilesystem() -} - // RootCgroup implements kernel.CgroupController.RootCgroup. func (c *controllerCommon) RootCgroup() kernel.Cgroup { return c.fs.rootCgroup() @@ -139,6 +133,17 @@ func (c *cgroupInode) Controllers() []kernel.CgroupController { return c.fs.kcontrollers } +// tasks returns a snapshot of the tasks inside the cgroup. +func (c *cgroupInode) tasks() []*kernel.Task { + c.fs.tasksMu.RLock() + defer c.fs.tasksMu.RUnlock() + ts := make([]*kernel.Task, 0, len(c.ts)) + for t := range c.ts { + ts = append(ts, t) + } + return ts +} + // Enter implements kernel.CgroupImpl.Enter. func (c *cgroupInode) Enter(t *kernel.Task) { c.fs.tasksMu.Lock() @@ -169,10 +174,7 @@ func (d *cgroupProcsData) Generate(ctx context.Context, buf *bytes.Buffer) error pgids := make(map[kernel.ThreadID]struct{}) - d.fs.tasksMu.RLock() - defer d.fs.tasksMu.RUnlock() - - for task := range d.ts { + for _, task := range d.tasks() { // Map dedups pgid, since iterating over all tasks produces multiple // entries for the group leaders. if pgid := currPidns.IDOfThreadGroup(task.ThreadGroup()); pgid != 0 { @@ -211,10 +213,7 @@ func (d *tasksData) Generate(ctx context.Context, buf *bytes.Buffer) error { var pids []kernel.ThreadID - d.fs.tasksMu.RLock() - defer d.fs.tasksMu.RUnlock() - - for task := range d.ts { + for _, task := range d.tasks() { if pid := currPidns.IDOfTask(task); pid != 0 { pids = append(pids, pid) } diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index bd3e69757..05d7eb4ce 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -49,8 +49,9 @@ // // kernel.CgroupRegistry.mu // cgroupfs.filesystem.mu -// Task.mu -// cgroupfs.filesystem.tasksMu. +// kernel.TaskSet.mu +// kernel.Task.mu +// cgroupfs.filesystem.tasksMu. package cgroupfs import ( @@ -109,7 +110,7 @@ type InternalData struct { DefaultControlValues map[string]int64 } -// filesystem implements vfs.FilesystemImpl. +// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. // // +stateify savable type filesystem struct { @@ -139,6 +140,11 @@ type filesystem struct { tasksMu sync.RWMutex `state:"nosave"` } +// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. +func (fs *filesystem) InitializeHierarchyID(hid uint32) { + fs.hierarchyID = hid +} + // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name @@ -284,14 +290,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same // controllers first. - hid, err := r.Register(fs.kcontrollers) - if err != nil { + if err := r.Register(fs.kcontrollers, fs); err != nil { ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) return nil, nil, syserror.EBUSY } - fs.hierarchyID = hid // Move all existing tasks to the root of the new hierarchy. k.PopulateNewCgroupHierarchy(fs.rootCgroup()) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 52879f871..368272f12 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -54,6 +54,7 @@ go_library( "//pkg/fspath", "//pkg/hostarch", "//pkg/log", + "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/refsvfs2", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 97ce80853..eb09d54c3 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -961,7 +961,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open } return &fd.vfsfd, nil case linux.S_IFLNK: - // Can't open symlinks without O_PATH (which is unimplemented). + // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, syserror.ELOOP case linux.S_IFSOCK: if d.isSynthetic() { @@ -1194,11 +1194,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if opts.Flags != 0 { - // Requires 9P support. - return syserror.EINVAL - } - + // Resolve newParent first to verify that it's on this Mount. var ds *[]*dentry fs.renameMu.Lock() defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) @@ -1206,8 +1202,21 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } + + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + return syserror.EINVAL + } + if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 { + // Requires 9P support to synchronize with other remote filesystem + // users. + return syserror.EINVAL + } + newName := rp.Component() if newName == "." || newName == ".." { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } return syserror.EBUSY } mnt := rp.Mount() @@ -1280,6 +1289,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } var replacedVFSD *vfs.Dentry if replaced != nil { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 21692d2ac..cf69e1b7a 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1282,9 +1282,12 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { - // We only support xattrs prefixed with "user." (see b/148380782). Currently, - // there is no need to expose any other xattrs through a gofer. - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + // Deny access to the "security" and "system" namespaces since applications + // may expect these to affect kernel behavior in unimplemented ways + // (b/148380782). Allow all other extended attributes to be passed through + // to the remote filesystem. This is inconsistent with Linux's 9p client, + // but consistent with other filesystems (e.g. FUSE). + if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { return syserror.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) @@ -1684,7 +1687,7 @@ func (d *dentry) setDeleted() { } func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { - if d.file.isNil() || !d.userXattrSupported() { + if d.file.isNil() { return nil, nil } xattrMap, err := d.file.listXattr(ctx, size) @@ -1693,10 +1696,7 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui } xattrs := make([]string, 0, len(xattrMap)) for x := range xattrMap { - // We only support xattrs in the user.* namespace. - if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { - xattrs = append(xattrs, x) - } + xattrs = append(xattrs, x) } return xattrs, nil } @@ -1731,13 +1731,6 @@ func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name return d.file.removeXattr(ctx, name) } -// Extended attributes in the user.* namespace are only supported for regular -// files and directories. -func (d *dentry) userXattrSupported() bool { - filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType() - return filetype == linux.ModeRegular || filetype == linux.ModeDirectory -} - // Preconditions: // * !d.isSynthetic(). // * d.isRegularFile() || d.isDir(). diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index f0e7bbaf7..eed05e369 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -59,7 +60,7 @@ func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, return nil, err } if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { - fsmetric.GoferOpensWX.Increment() + metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file") } if atomic.LoadInt32(&d.mmapFD) >= 0 { fsmetric.GoferOpensHost.Increment() diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index ac3b5b621..c12444b7e 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsmetric" @@ -100,7 +101,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci d.fs.specialFileFDs[fd] = struct{}{} d.fs.syncMu.Unlock() if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { - fsmetric.GoferOpensWX.Increment() + metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file") } if h.fd >= 0 { fsmetric.GoferOpensHost.Increment() diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index f50b0fb08..8fac53c60 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -635,12 +635,6 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - // Only RENAME_NOREPLACE is supported. - if opts.Flags&^linux.RENAME_NOREPLACE != 0 { - return syserror.EINVAL - } - noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 - fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() @@ -651,6 +645,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } + + // Only RENAME_NOREPLACE is supported. + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + return syserror.EINVAL + } + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 + mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 46c500427..6b6fa0bd5 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -1017,10 +1017,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if opts.Flags != 0 { - return syserror.EINVAL - } - + // Resolve newParent first to verify that it's on this Mount. var ds *[]*dentry fs.renameMu.Lock() defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) @@ -1028,8 +1025,16 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } + + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + return syserror.EINVAL + } + newName := rp.Component() if newName == "." || newName == ".." { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } return syserror.EBUSY } mnt := rp.Mount() @@ -1093,6 +1098,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return err } if replaced != nil { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 43bfd69a3..82491a0f8 100644 --- a/pkg/sentry/fsimpl/overlay/regular_file.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -207,9 +207,10 @@ func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) e return err } - // Changing owners may clear one or both of the setuid and setgid bits, - // so we may have to update opts before setting d.mode. - if opts.Stat.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + // Changing owners or truncating may clear one or both of the setuid and + // setgid bits, so we may have to update opts before setting d.mode. + inotifyMask := opts.Stat.Mask + if opts.Stat.Mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 { stat, err := wrappedFD.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_MODE, }) @@ -218,10 +219,14 @@ func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) e } opts.Stat.Mode = stat.Mode opts.Stat.Mask |= linux.STATX_MODE + // Don't generate inotify IN_ATTRIB for size-only changes (truncations). + if opts.Stat.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + inotifyMask |= linux.STATX_MODE + } } d.updateAfterSetStatLocked(&opts) - if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + if ev := vfs.InotifyEventFromStatMask(inotifyMask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 9b14dd6b9..2bc98a94f 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -55,6 +55,7 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k * }), }), "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ + "max_map_count": fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")), "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), }), @@ -365,27 +366,22 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { } // ipForwarding implements vfs.WritableDynamicBytesSource for -// /proc/sys/net/ipv4/ip_forwarding. +// /proc/sys/net/ipv4/ip_forward. // // +stateify savable type ipForwarding struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` - enabled *bool + enabled bool } var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { - if ipf.enabled == nil { - enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber) - ipf.enabled = &enabled - } - val := "0\n" - if *ipf.enabled { + if ipf.enabled { // Technically, this is not quite compatible with Linux. Linux stores these // as an integer, so if you write "2" into tcp_sack, you should get 2 back. // Tough luck. @@ -414,11 +410,8 @@ func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offs if err != nil { return 0, err } - if ipf.enabled == nil { - ipf.enabled = new(bool) - } - *ipf.enabled = v != 0 - if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil { + ipf.enabled = v != 0 + if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { return 0, err } return n, nil diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go index 6cee22823..19b012f7d 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -132,7 +132,7 @@ func TestConfigureIPForwarding(t *testing.T) { t.Run(c.comment, func(t *testing.T) { s.IPForwarding = c.initial - file := &ipForwarding{stack: s, enabled: &c.initial} + file := &ipForwarding{stack: s, enabled: c.initial} // Write the values. src := usermem.BytesIOSequence([]byte(c.str)) diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index c766164c7..b3f9d1010 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -17,7 +17,6 @@ go_library( "//pkg/fspath", "//pkg/hostarch", "//pkg/memutil", - "//pkg/metric", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 33e52ce64..97aa20cd1 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -63,8 +62,6 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("creating platform: %v", err) } - metric.CreateSentryMetrics() - kernel.VFS2Enabled = true k := &kernel.Kernel{ Platform: plat, @@ -88,6 +85,7 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("creating timekeeper: %v", err) } tk.SetClocks(time.NewCalibratedClocks()) + k.SetTimekeeper(tk) creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) @@ -96,7 +94,6 @@ func Boot() (*kernel.Kernel, error) { if err = k.Init(kernel.InitKernelArgs{ ApplicationCores: uint(runtime.GOMAXPROCS(-1)), FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, RootUserNamespace: creds.UserNamespace, Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), @@ -181,7 +178,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { memfile := os.NewFile(uintptr(memfd), memfileName) mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { - memfile.Close() + _ = memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) } return mf, nil diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5fdca1d46..f0f4297ef 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -465,7 +465,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open } return &fd.vfsfd, nil case *symlink: - // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH. + // Can't open symlinks without O_PATH, which is handled at the VFS layer. return nil, syserror.ELOOP case *namedPipe: return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) @@ -496,20 +496,24 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if opts.Flags != 0 { - // TODO(b/145974740): Support renameat2 flags. - return syserror.EINVAL - } - - // Resolve newParent first to verify that it's on this Mount. + // Resolve newParentDir first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } + + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { + // TODO(b/145974740): Support other renameat2 flags. + return syserror.EINVAL + } + newName := rp.Component() if newName == "." || newName == ".." { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } return syserror.EBUSY } mnt := rp.Mount() @@ -556,6 +560,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } replaced, ok := newParentDir.childMap[newName] if ok { + if opts.Flags&linux.RENAME_NOREPLACE != 0 { + return syserror.EEXIST + } replacedDir, ok := replaced.inode.impl.(*directory) if ok { if !renamed.inode.isDir() { @@ -815,7 +822,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si if err != nil { return nil, err } - return d.inode.listXattr(size) + return d.inode.listXattr(rp.Credentials(), size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 9ae25ce9e..6b4367c42 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -717,44 +717,63 @@ func (i *inode) touchCMtimeLocked() { atomic.StoreInt64(&i.ctime, now) } -func (i *inode) listXattr(size uint64) ([]string, error) { - return i.xattrs.ListXattr(size) +func checkXattrName(name string) error { + // Linux's tmpfs supports "security" and "trusted" xattr namespaces, and + // (depending on build configuration) POSIX ACL xattr namespaces + // ("system.posix_acl_access" and "system.posix_acl_default"). We don't + // support POSIX ACLs or the "security" namespace (b/148380782). + if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { + return nil + } + // We support the "user" namespace because we have tests that depend on + // this feature. + if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return nil + } + return syserror.EOPNOTSUPP +} + +func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { + return i.xattrs.ListXattr(creds, size) } func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { - if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { + if err := checkXattrName(opts.Name); err != nil { return "", err } - return i.xattrs.GetXattr(opts) + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { + return "", err + } + return i.xattrs.GetXattr(creds, mode, kuid, opts) } func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { - if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { + if err := checkXattrName(opts.Name); err != nil { return err } - return i.xattrs.SetXattr(opts) -} - -func (i *inode) removeXattr(creds *auth.Credentials, name string) error { - if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } - return i.xattrs.RemoveXattr(name) + return i.xattrs.SetXattr(creds, mode, kuid, opts) } -func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { - // We currently only support extended attributes in the user.* and - // trusted.* namespaces. See b/148380782. - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { - return syserror.EOPNOTSUPP +func (i *inode) removeXattr(creds *auth.Credentials, name string) error { + if err := checkXattrName(name); err != nil { + return err } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) kuid := auth.KUID(atomic.LoadUint32(&i.uid)) kgid := auth.KGID(atomic.LoadUint32(&i.gid)) - if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } - return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) + return i.xattrs.RemoveXattr(creds, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of @@ -807,7 +826,7 @@ func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { - return fd.inode().listXattr(size) + return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 31d34ef60..969003613 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -868,6 +868,10 @@ func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCa fd.mu.Lock() defer fd.mu.Unlock() + if _, err := fd.lowerFD.Seek(ctx, fd.off, linux.SEEK_SET); err != nil { + return err + } + var ds []vfs.Dirent err := fd.lowerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { // Do not include the Merkle tree files. @@ -890,8 +894,8 @@ func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCa return err } - // The result should contain all children plus "." and "..". - if fd.d.verityEnabled() && len(ds) != len(fd.d.childrenNames)+2 { + // The result should be a part of all children plus "." and "..", counting from fd.off. + if fd.d.verityEnabled() && len(ds) != len(fd.d.childrenNames)+2-int(fd.off) { return fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Unexpected children number %d", len(ds))) } @@ -1299,6 +1303,11 @@ func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapO return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) } +// SupportsLocks implements vfs.FileDescriptionImpl.SupportsLocks. +func (fd *fileDescription) SupportsLocks() bool { + return fd.lowerFD.SupportsLocks() +} + // LockBSD implements vfs.FileDescriptionImpl.LockBSD. func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return fd.lowerFD.LockBSD(ctx, ownerPID, t, block) diff --git a/pkg/sentry/fsmetric/fsmetric.go b/pkg/sentry/fsmetric/fsmetric.go index 7e535b527..17d0d5025 100644 --- a/pkg/sentry/fsmetric/fsmetric.go +++ b/pkg/sentry/fsmetric/fsmetric.go @@ -42,7 +42,6 @@ var ( // Metrics that only apply to fs/gofer and fsimpl/gofer. var ( - GoferOpensWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a executable file was opened writably from a gofer.") GoferOpens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a file was opened from a gofer and did not have a host file descriptor.") GoferOpensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a file was opened from a gofer and did have a host file descriptor.") GoferReads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 6b71bd3a9..80dda1559 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -88,9 +88,6 @@ type Stack interface { // for restoring a stack after a save. RestoreCleanupEndpoints([]stack.TransportEndpoint) - // Forwarding returns if packet forwarding between NICs is enabled. - Forwarding(protocol tcpip.NetworkProtocolNumber) bool - // SetForwarding enables or disables packet forwarding between NICs. SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index 03e2608c2..218d9dafc 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -154,11 +154,6 @@ func (s *TestStack) CleanupEndpoints() []stack.TransportEndpoint { // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *TestStack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} -// Forwarding implements inet.Stack.Forwarding. -func (s *TestStack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { - return s.IPForwarding -} - // SetForwarding implements inet.Stack.SetForwarding. func (s *TestStack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { s.IPForwarding = enable diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index a1ec6daab..188c0ebff 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -32,7 +32,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//pkg/sync:generic_seqatomic", + template = "//pkg/sync/seqatomic:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 869e49ebc..12180351d 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//pkg/sync:generic_atomicptr", + template = "//pkg/sync/atomicptr:generic_atomicptr", types = { "Value": "Credentials", }, diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index 6862f2ef5..3325fedcb 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -125,7 +125,7 @@ func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *T creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps - // TODO(nlacasse): Support ambient capabilities. + // TODO(gvisor.dev/issue/3166): Support ambient capabilities. } else { // If no capabilities are specified, grant capabilities consistent with // setresuid + setresgid from NewRootCredentials to the given uid and diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go index 1f1c63f37..c93ef6ac1 100644 --- a/pkg/sentry/kernel/cgroup.go +++ b/pkg/sentry/kernel/cgroup.go @@ -48,10 +48,6 @@ type CgroupController interface { // attached to. Returned value is valid for the lifetime of the controller. HierarchyID() uint32 - // Filesystem returns the filesystem this controller is attached to. - // Returned value is valid for the lifetime of the controller. - Filesystem() *vfs.Filesystem - // RootCgroup returns the root cgroup for this controller. Returned value is // valid for the lifetime of the controller. RootCgroup() Cgroup @@ -124,6 +120,19 @@ func (h *hierarchy) match(ctypes []CgroupControllerType) bool { return true } +// cgroupFS is the public interface to cgroupfs. This lets the kernel package +// refer to cgroupfs.filesystem methods without directly depending on the +// cgroupfs package, which would lead to a circular dependency. +type cgroupFS interface { + // Returns the vfs.Filesystem for the cgroupfs. + VFSFilesystem() *vfs.Filesystem + + // InitializeHierarchyID sets the hierarchy ID for this filesystem during + // filesystem creation. May only be called before the filesystem is visible + // to the vfs layer. + InitializeHierarchyID(hid uint32) +} + // CgroupRegistry tracks the active set of cgroup controllers on the system. // // +stateify savable @@ -172,7 +181,23 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files for _, h := range r.hierarchies { if h.match(ctypes) { - h.fs.IncRef() + if !h.fs.TryIncRef() { + // Racing with filesystem destruction, namely h.fs.Release. + // Since we hold r.mu, we know the hierarchy hasn't been + // unregistered yet, but its associated filesystem is tearing + // down. + // + // If we simply indicate the hierarchy wasn't found without + // cleaning up the registry, the caller can race with the + // unregister and find itself temporarily unable to create a new + // hierarchy with a subset of the relevant controllers. + // + // To keep the result of FindHierarchy consistent with the + // uniqueness of controllers enforced by Register, drop the + // dying hierarchy now. The eventual unregister by the FS + // teardown will become a no-op. + return nil + } return h.fs } } @@ -182,31 +207,35 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files // Register registers the provided set of controllers with the registry as a new // hierarchy. If any controller is already registered, the function returns an -// error without modifying the registry. The hierarchy can be later referenced -// by the returned id. -func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) { +// error without modifying the registry. Register sets the hierarchy ID for the +// filesystem on success. +func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error { r.mu.Lock() defer r.mu.Unlock() if len(cs) == 0 { - return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers") + return fmt.Errorf("can't register hierarchy with no controllers") } for _, c := range cs { if _, ok := r.controllers[c.Type()]; ok { - return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy") + return fmt.Errorf("controllers may only be mounted on a single hierarchy") } } hid, err := r.nextHierarchyID() if err != nil { - return hid, err + return err } + // Must not fail below here, once we publish the hierarchy ID. + + fs.InitializeHierarchyID(hid) + h := hierarchy{ id: hid, controllers: make(map[CgroupControllerType]CgroupController), - fs: cs[0].Filesystem(), + fs: fs.VFSFilesystem(), } for _, c := range cs { n := c.Type() @@ -214,15 +243,20 @@ func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) { h.controllers[n] = c } r.hierarchies[hid] = h - return hid, nil + return nil } -// Unregister removes a previously registered hierarchy from the registry. If -// the controller was not previously registered, Unregister is a no-op. +// Unregister removes a previously registered hierarchy from the registry. If no +// such hierarchy is registered, Unregister is a no-op. func (r *CgroupRegistry) Unregister(hid uint32) { r.mu.Lock() - defer r.mu.Unlock() + r.unregisterLocked(hid) + r.mu.Unlock() +} +// Precondition: Caller must hold r.mu. +// +checklocks:r.mu +func (r *CgroupRegistry) unregisterLocked(hid uint32) { if h, ok := r.hierarchies[hid]; ok { for name, _ := range h.controllers { delete(r.controllers, name) @@ -253,6 +287,11 @@ func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[C for name, ctl := range r.controllers { if _, ok := ctlSet[name]; !ok { cg := ctl.RootCgroup() + // Multiple controllers may share the same hierarchy, so may have + // the same root cgroup. Grab a single ref per hierarchy root. + if _, ok := cgset[cg]; ok { + continue + } cg.IncRef() // Ref transferred to caller. cgset[cg] = struct{}{} } diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index f855f038b..6224a0cbd 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -8,7 +8,6 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index dbbbaeeb0..5d584dc45 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -17,7 +17,6 @@ package fasync import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -125,9 +124,9 @@ func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) { if !permCheck { return } - signalInfo := &arch.SignalInfo{ + signalInfo := &linux.SignalInfo{ Signo: int32(linux.SIGIO), - Code: arch.SignalInfoKernel, + Code: linux.SI_KERNEL, } if a.signal != 0 { signalInfo.Signo = int32(a.signal) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 10885688c..62777faa8 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -154,9 +154,11 @@ func (f *FDTable) drop(ctx context.Context, file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { // Release any POSIX lock possibly held by the FDTable. - err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) - if err != nil && err != syserror.ENOLCK { - panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + if file.SupportsLocks() { + err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } } // Drop the table's reference. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index a75686cf3..6c31e082c 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = "//pkg/sync:generic_atomicptr", + template = "//pkg/sync/atomicptr:generic_atomicptr", types = { "Value": "bucket", }, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index e6e9da898..d537e608a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -306,9 +306,6 @@ type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet *cpuid.FeatureSet - // Timekeeper manages time for all tasks in the system. - Timekeeper *Timekeeper - // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace @@ -348,29 +345,34 @@ type InitKernelArgs struct { PIDNamespace *PIDNamespace } +// SetTimekeeper sets Kernel.timekeeper. SetTimekeeper must be called before +// Init. +func (k *Kernel) SetTimekeeper(tk *Timekeeper) { + k.timekeeper = tk +} + // Init initialize the Kernel with no tasks. // // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile -// before calling Init. +// and Kernel.SetTimekeeper before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { - return fmt.Errorf("FeatureSet is nil") + return fmt.Errorf("args.FeatureSet is nil") } - if args.Timekeeper == nil { - return fmt.Errorf("Timekeeper is nil") + if k.timekeeper == nil { + return fmt.Errorf("timekeeper is nil") } - if args.Timekeeper.clocks == nil { + if k.timekeeper.clocks == nil { return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { - return fmt.Errorf("RootUserNamespace is nil") + return fmt.Errorf("args.RootUserNamespace is nil") } if args.ApplicationCores == 0 { - return fmt.Errorf("ApplicationCores is 0") + return fmt.Errorf("args.ApplicationCores is 0") } k.featureSet = args.FeatureSet - k.timekeeper = args.Timekeeper k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace @@ -395,8 +397,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.extraAuxv = args.ExtraAuxv k.vdso = args.Vdso - k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime} - k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} + k.realtimeClock = &timekeeperClock{tk: k.timekeeper, c: sentrytime.Realtime} + k.monotonicClock = &timekeeperClock{tk: k.timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() k.ptraceExceptions = make(map[*Task]*Task) @@ -654,12 +656,12 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. - if mm := t.image.MemoryManager; mm != nil { - if _, ok := invalidated[mm]; !ok { - if err := mm.InvalidateUnsavable(ctx); err != nil { + if memMgr := t.image.MemoryManager; memMgr != nil { + if _, ok := invalidated[memMgr]; !ok { + if err := memMgr.InvalidateUnsavable(ctx); err != nil { return err } - invalidated[mm] = struct{}{} + invalidated[memMgr] = struct{}{} } } // I really wish we just had a sync.Map of all MMs... @@ -1339,7 +1341,7 @@ func (k *Kernel) Unpause() { // context is used only for debugging to describe how the signal was received. // // Preconditions: Kernel must have an init process. -func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { +func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) { k.extMu.Lock() defer k.extMu.Unlock() k.sendExternalSignal(info, context) @@ -1347,7 +1349,7 @@ func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) { // SendExternalSignalThreadGroup injects a signal into an specific ThreadGroup. // This function doesn't skip signals like SendExternalSignal does. -func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.SignalInfo) error { +func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() return tg.SendSignal(info) @@ -1355,7 +1357,7 @@ func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *arch.Signa // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. -func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error { +func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() @@ -1553,22 +1555,23 @@ func (k *Kernel) SetSaveError(err error) { var _ tcpip.Clock = (*Kernel)(nil) -// NowNanoseconds implements tcpip.Clock.NowNanoseconds. -func (k *Kernel) NowNanoseconds() int64 { - now, err := k.timekeeper.GetTime(sentrytime.Realtime) +// Now implements tcpip.Clock.NowNanoseconds. +func (k *Kernel) Now() time.Time { + nsec, err := k.timekeeper.GetTime(sentrytime.Realtime) if err != nil { - panic("Kernel.NowNanoseconds: " + err.Error()) + panic("timekeeper.GetTime(sentrytime.Realtime): " + err.Error()) } - return now + return time.Unix(0, nsec) } // NowMonotonic implements tcpip.Clock.NowMonotonic. -func (k *Kernel) NowMonotonic() int64 { - now, err := k.timekeeper.GetTime(sentrytime.Monotonic) +func (k *Kernel) NowMonotonic() tcpip.MonotonicTime { + nsec, err := k.timekeeper.GetTime(sentrytime.Monotonic) if err != nil { - panic("Kernel.NowMonotonic: " + err.Error()) + panic("timekeeper.GetTime(sentrytime.Monotonic): " + err.Error()) } - return now + var mt tcpip.MonotonicTime + return mt.Add(time.Duration(nsec) * time.Nanosecond) } // AfterFunc implements tcpip.Clock.AfterFunc. @@ -1783,7 +1786,7 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { }) t := TaskFromContext(ctx) - k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + _, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ Tid: int32(t.ThreadID()), Registers: t.Arch().StateData().Proto(), }) @@ -1858,7 +1861,9 @@ func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { return } t.mu.Lock() - t.enterCgroupLocked(root) + // A task can be in the cgroup if it has been created after the + // cgroup hierarchy was registered. + t.enterCgroupIfNotYetLocked(root) t.mu.Unlock() }) k.tasks.mu.RUnlock() @@ -1874,7 +1879,7 @@ func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { return } t.mu.Lock() - for cg, _ := range t.cgroups { + for cg := range t.cgroups { if cg.HierarchyID() == hid { t.leaveCgroupLocked(cg) } diff --git a/pkg/sentry/kernel/pending_signals.go b/pkg/sentry/kernel/pending_signals.go index 77a35b788..af455c434 100644 --- a/pkg/sentry/kernel/pending_signals.go +++ b/pkg/sentry/kernel/pending_signals.go @@ -17,7 +17,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" - "gvisor.dev/gvisor/pkg/sentry/arch" ) const ( @@ -65,7 +64,7 @@ type pendingSignalQueue struct { type pendingSignal struct { // pendingSignalEntry links into a pendingSignalList. pendingSignalEntry - *arch.SignalInfo + *linux.SignalInfo // If timer is not nil, it is the IntervalTimer which sent this signal. timer *IntervalTimer @@ -75,7 +74,7 @@ type pendingSignal struct { // on failure (if the given signal's queue is full). // // Preconditions: info represents a valid signal. -func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bool { +func (p *pendingSignals) enqueue(info *linux.SignalInfo, timer *IntervalTimer) bool { sig := linux.Signal(info.Signo) q := &p.signals[sig.Index()] if sig.IsStandard() { @@ -93,7 +92,7 @@ func (p *pendingSignals) enqueue(info *arch.SignalInfo, timer *IntervalTimer) bo // dequeue dequeues and returns any pending signal not masked by mask. If no // unmasked signals are pending, dequeue returns nil. -func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { +func (p *pendingSignals) dequeue(mask linux.SignalSet) *linux.SignalInfo { // "Real-time signals are delivered in a guaranteed order. Multiple // real-time signals of the same type are delivered in the order they were // sent. If different real-time signals are sent to a process, they are @@ -111,7 +110,7 @@ func (p *pendingSignals) dequeue(mask linux.SignalSet) *arch.SignalInfo { return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1)) } -func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *arch.SignalInfo { +func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *linux.SignalInfo { q := &p.signals[sig.Index()] ps := q.pendingSignalList.Front() if ps == nil { diff --git a/pkg/sentry/kernel/pending_signals_state.go b/pkg/sentry/kernel/pending_signals_state.go index ca8b4e164..e77f1a254 100644 --- a/pkg/sentry/kernel/pending_signals_state.go +++ b/pkg/sentry/kernel/pending_signals_state.go @@ -14,13 +14,11 @@ package kernel -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" -) +import "gvisor.dev/gvisor/pkg/abi/linux" // +stateify savable type savedPendingSignal struct { - si *arch.SignalInfo + si *linux.SignalInfo timer *IntervalTimer } diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 2d89b9ccd..3fa5d1d2f 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -86,6 +86,12 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) if n > 0 { p.Notify(waiter.ReadableEvents) } + if err == unix.EPIPE { + // If we are returning EPIPE send SIGPIPE to the task. + if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil { + sendSig(linux.SIGPIPE) + } + } return n, err } @@ -129,7 +135,7 @@ func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgume v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. - iocc := primitive.IOCopyContext{ + iocc := usermem.IOCopyContext{ IO: io, Ctx: ctx, Opts: usermem.IOOpts{ diff --git a/pkg/sentry/kernel/posixtimer.go b/pkg/sentry/kernel/posixtimer.go index 2e861a5a8..d801a3d83 100644 --- a/pkg/sentry/kernel/posixtimer.go +++ b/pkg/sentry/kernel/posixtimer.go @@ -18,7 +18,6 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -97,7 +96,7 @@ func (it *IntervalTimer) ResumeTimer() { } // Preconditions: it.target's signal mutex must be locked. -func (it *IntervalTimer) updateDequeuedSignalLocked(si *arch.SignalInfo) { +func (it *IntervalTimer) updateDequeuedSignalLocked(si *linux.SignalInfo) { it.sigpending = false if it.sigorphan { return @@ -138,9 +137,9 @@ func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Settin it.sigpending = true it.sigorphan = false it.overrunCur += exp - 1 - si := &arch.SignalInfo{ + si := &linux.SignalInfo{ Signo: int32(it.signo), - Code: arch.SignalInfoTimer, + Code: linux.SI_TIMER, } si.SetTimerID(it.id) si.SetSigval(it.sigval) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 57c7659e7..a6287fd6a 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -394,7 +393,7 @@ func (t *Task) ptraceTrapLocked(code int32) { t.trapStopPending = false t.tg.signalHandlers.mu.Unlock() t.ptraceCode = code - t.ptraceSiginfo = &arch.SignalInfo{ + t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(linux.SIGTRAP), Code: code, } @@ -402,7 +401,7 @@ func (t *Task) ptraceTrapLocked(code int32) { t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() - tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.signalStop(t, linux.CLD_TRAPPED, int32(linux.SIGTRAP)) tracer.tg.eventQueue.Notify(EventTraceeStop) } } @@ -542,9 +541,9 @@ func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." - // ptrace(2) if !seize { - target.sendSignalLocked(&arch.SignalInfo{ + target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }, false /* group */) } // Undocumented Linux feature: If the tracee is already group-stopped (and @@ -586,7 +585,7 @@ func (t *Task) exitPtrace() { for target := range t.ptraceTracees { if target.ptraceOpts.ExitKill { target.tg.signalHandlers.mu.Lock() - target.sendSignalLocked(&arch.SignalInfo{ + target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), }, false /* group */) target.tg.signalHandlers.mu.Unlock() @@ -652,7 +651,7 @@ func (t *Task) forgetTracerLocked() { // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. -func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { +func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false } @@ -678,7 +677,7 @@ func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { t.ptraceSiginfo = info t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) if t.beginPtraceStopLocked() { - tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.signalStop(t, linux.CLD_TRAPPED, info.Signo) tracer.tg.eventQueue.Notify(EventTraceeStop) } return true @@ -829,7 +828,7 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions if child.ptraceSeized { child.trapStopPending = true } else { - child.pendingSignals.enqueue(&arch.SignalInfo{ + child.pendingSignals.enqueue(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), }, nil) } @@ -893,9 +892,9 @@ func (t *Task) ptraceExec(oldTID ThreadID) { } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - t.sendSignalLocked(&arch.SignalInfo{ + t.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGTRAP), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }, false /* group */) } @@ -1228,7 +1227,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { return err case linux.PTRACE_SETSIGINFO: - var info arch.SignalInfo + var info linux.SignalInfo if _, err := info.CopyIn(t, data); err != nil { return err } diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index a95e174a2..54ca43c2e 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -39,11 +39,11 @@ func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { } } -func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *arch.SignalInfo { - si := &arch.SignalInfo{ +func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo { + si := &linux.SignalInfo{ Signo: int32(linux.SIGSYS), Errno: errno, - Code: arch.SYS_SECCOMP, + Code: linux.SYS_SECCOMP, } si.SetCallAddr(uint64(ip)) si.SetSyscall(sysno) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index fe2ab1662..47bb66b42 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -35,10 +35,10 @@ const ( // Maximum number of semaphore sets. setsMax = linux.SEMMNI - // Maximum number of semaphroes in a semaphore set. + // Maximum number of semaphores in a semaphore set. semsMax = linux.SEMMSL - // Maximum number of semaphores in all semaphroe sets. + // Maximum number of semaphores in all semaphore sets. semsTotalMax = linux.SEMMNS ) @@ -171,10 +171,10 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu // Map semaphores and map indexes in a registry are of the same size, // check map semaphores only here for the system limit. if len(r.semaphores) >= setsMax { - return nil, syserror.EINVAL + return nil, syserror.ENOSPC } if r.totalSems() > int(semsTotalMax-nsems) { - return nil, syserror.EINVAL + return nil, syserror.ENOSPC } // Finally create a new set. @@ -220,7 +220,7 @@ func (r *Registry) HighestIndex() int32 { defer r.mu.Unlock() // By default, highest used index is 0 even though - // there is no semaphroe set. + // there is no semaphore set. var highestIndex int32 for index := range r.indexes { if index > highestIndex { @@ -702,7 +702,9 @@ func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool { return s.checkCapability(creds) } -// destroy destroys the set. Caller must hold 's.mu'. +// destroy destroys the set. +// +// Preconditions: Caller must hold 's.mu'. func (s *Set) destroy() { // Notify all waiters. They will fail on the next attempt to execute // operations and return error. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0cd9e2533..ca9076406 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,7 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -233,7 +232,7 @@ func (pg *ProcessGroup) Session() *Session { // SendSignal sends a signal to all processes inside the process group. It is // analagous to kernel/signal.c:kill_pgrp. -func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error { +func (pg *ProcessGroup) SendSignal(info *linux.SignalInfo) error { tasks := pg.originator.TaskSet() tasks.mu.RLock() defer tasks.mu.RUnlock() diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go index 2488ae7d5..e08474d25 100644 --- a/pkg/sentry/kernel/signal.go +++ b/pkg/sentry/kernel/signal.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" ) @@ -36,7 +35,7 @@ const SignalPanic = linux.SIGUSR2 // context is used only for debugging to differentiate these cases. // // Preconditions: Kernel must have an init process. -func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { +func (k *Kernel) sendExternalSignal(info *linux.SignalInfo, context string) { switch linux.Signal(info.Signo) { case linux.SIGURG: // Sent by the Go 1.14+ runtime for asynchronous goroutine preemption. @@ -60,18 +59,18 @@ func (k *Kernel) sendExternalSignal(info *arch.SignalInfo, context string) { } // SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV. -func SignalInfoPriv(sig linux.Signal) *arch.SignalInfo { - return &arch.SignalInfo{ +func SignalInfoPriv(sig linux.Signal) *linux.SignalInfo { + return &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoKernel, + Code: linux.SI_KERNEL, } } // SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO. -func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo { - info := &arch.SignalInfo{ +func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *linux.SignalInfo { + info := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, } info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg))) info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index 768fda220..147cc41bb 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -16,7 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" ) @@ -30,14 +29,14 @@ type SignalHandlers struct { mu sync.Mutex `state:"nosave"` // actions is the action to be taken upon receiving each signal. - actions map[linux.Signal]arch.SignalAct + actions map[linux.Signal]linux.SigAction } // NewSignalHandlers returns a new SignalHandlers specifying all default // actions. func NewSignalHandlers() *SignalHandlers { return &SignalHandlers{ - actions: make(map[linux.Signal]arch.SignalAct), + actions: make(map[linux.Signal]linux.SigAction), } } @@ -59,9 +58,9 @@ func (sh *SignalHandlers) CopyForExec() *SignalHandlers { sh.mu.Lock() defer sh.mu.Unlock() for sig, act := range sh.actions { - if act.Handler == arch.SignalActIgnore { - sh2.actions[sig] = arch.SignalAct{ - Handler: arch.SignalActIgnore, + if act.Handler == linux.SIG_IGN { + sh2.actions[sig] = linux.SigAction{ + Handler: linux.SIG_IGN, } } } @@ -73,15 +72,15 @@ func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool { sh.mu.Lock() defer sh.mu.Unlock() sa, ok := sh.actions[sig] - return ok && sa.Handler == arch.SignalActIgnore + return ok && sa.Handler == linux.SIG_IGN } // dequeueActionLocked returns the SignalAct that should be used to handle sig. // // Preconditions: sh.mu must be locked. -func (sh *SignalHandlers) dequeueAction(sig linux.Signal) arch.SignalAct { +func (sh *SignalHandlers) dequeueAction(sig linux.Signal) linux.SigAction { act := sh.actions[sig] - if act.IsResetHandler() { + if act.Flags&linux.SA_RESETHAND != 0 { delete(sh.actions, sig) } return act diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index be1371855..2e3b4488a 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -151,7 +150,7 @@ type Task struct { // which the SA_ONSTACK flag is set. // // signalStack is exclusive to the task goroutine. - signalStack arch.SignalStack + signalStack linux.SignalStack // signalQueue is a set of registered waiters for signal-related events. // @@ -395,7 +394,7 @@ type Task struct { // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. // // ptraceSiginfo is protected by the TaskSet mutex. - ptraceSiginfo *arch.SignalInfo + ptraceSiginfo *linux.SignalInfo // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to // the tracer by ptrace(PTRACE_GETEVENTMSG). @@ -853,15 +852,13 @@ func (t *Task) SetOOMScoreAdj(adj int32) error { return nil } -// UID returns t's uid. -// TODO(gvisor.dev/issue/170): This method is not namespaced yet. -func (t *Task) UID() uint32 { +// KUID returns t's kuid. +func (t *Task) KUID() uint32 { return uint32(t.Credentials().EffectiveKUID) } -// GID returns t's gid. -// TODO(gvisor.dev/issue/170): This method is not namespaced yet. -func (t *Task) GID() uint32 { +// KGID returns t's kgid. +func (t *Task) KGID() uint32 { return uint32(t.Credentials().EffectiveKGID) } diff --git a/pkg/sentry/kernel/task_cgroup.go b/pkg/sentry/kernel/task_cgroup.go index 25d2504fa..7c138e80f 100644 --- a/pkg/sentry/kernel/task_cgroup.go +++ b/pkg/sentry/kernel/task_cgroup.go @@ -85,6 +85,14 @@ func (t *Task) enterCgroupLocked(c Cgroup) { c.Enter(t) } +// +checklocks:t.mu +func (t *Task) enterCgroupIfNotYetLocked(c Cgroup) { + if _, ok := t.cgroups[c]; ok { + return + } + t.enterCgroupLocked(c) +} + // LeaveCgroups removes t out from all its cgroups. func (t *Task) LeaveCgroups() { t.mu.Lock() diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 70b0699dc..c82d9e82b 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -17,6 +17,7 @@ package kernel import ( "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -113,6 +114,10 @@ func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { return t.k.RealtimeClock() case limits.CtxLimits: return t.tg.limits + case linux.CtxSignalNoInfoFunc: + return func(sig linux.Signal) error { + return t.SendSignal(SignalInfoNoInfo(sig, t, t)) + } case pgalloc.CtxMemoryFile: return t.k.mf case pgalloc.CtxMemoryFileProvider: diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index d9897e802..cf8571262 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -66,7 +66,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -181,7 +180,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() t.endStopCond.L = &t.tg.signalHandlers.mu // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) - t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable} + t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE} // "The termination signal is reset to SIGCHLD (see clone(2))." t.tg.terminationSignal = linux.SIGCHLD // execed indicates that the process can no longer join a process group diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b1af1a7ef..d115b8783 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -28,9 +28,9 @@ import ( "errors" "fmt" "strconv" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" @@ -50,6 +50,23 @@ type ExitStatus struct { Signo int } +func (es ExitStatus) String() string { + var b strings.Builder + if code := es.Code; code != 0 { + if b.Len() != 0 { + b.WriteByte(' ') + } + _, _ = fmt.Fprintf(&b, "Code=%d", code) + } + if signal := es.Signo; signal != 0 { + if b.Len() != 0 { + b.WriteByte(' ') + } + _, _ = fmt.Fprintf(&b, "Signal=%d", signal) + } + return b.String() +} + // Signaled returns true if the ExitStatus indicates that the exiting task or // thread group was killed by a signal. func (es ExitStatus) Signaled() bool { @@ -122,12 +139,12 @@ func (t *Task) killLocked() { if t.stop != nil && t.stop.Killable() { t.endInternalStopLocked() } - t.pendingSignals.enqueue(&arch.SignalInfo{ + t.pendingSignals.enqueue(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without // enqueueing an actual siginfo, such that // kernel/signal.c:collect_signal() initializes si_code to SI_USER. - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }, nil) t.interrupt() } @@ -332,7 +349,7 @@ func (t *Task) exitThreadGroup() bool { // signalStop must be called with t's signal mutex unlocked. t.tg.signalHandlers.mu.Unlock() if notifyParent && t.tg.leader.parent != nil { - t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.signalStop(t, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } return last @@ -353,7 +370,7 @@ func (t *Task) exitChildren() { continue } other.signalHandlers.mu.Lock() - other.leader.sendSignalLocked(&arch.SignalInfo{ + other.leader.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), }, true /* group */) other.signalHandlers.mu.Unlock() @@ -368,9 +385,9 @@ func (t *Task) exitChildren() { // wait for a parent to reap them.) for c := range t.children { if sig := c.ParentDeathSignal(); sig != 0 { - siginfo := &arch.SignalInfo{ + siginfo := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, } siginfo.SetPID(int32(c.tg.pidns.tids[t])) siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) @@ -652,10 +669,10 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { t.parent.tg.signalHandlers.mu.Lock() if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach { if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok { - if act.Handler == arch.SignalActIgnore { + if act.Handler == linux.SIG_IGN { t.exitParentAcked = true signalParent = false - } else if act.Flags&arch.SignalFlagNoCldWait != 0 { + } else if act.Flags&linux.SA_NOCLDWAIT != 0 { t.exitParentAcked = true } } @@ -705,17 +722,17 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { } // Preconditions: The TaskSet mutex must be locked. -func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo { - info := &arch.SignalInfo{ +func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo { + info := &linux.SignalInfo{ Signo: int32(sig), } info.SetPID(int32(receiver.tg.pidns.tids[t])) info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) if t.exitStatus.Signaled() { - info.Code = arch.CLD_KILLED + info.Code = linux.CLD_KILLED info.SetStatus(int32(t.exitStatus.Signo)) } else { - info.Code = arch.CLD_EXITED + info.Code = linux.CLD_EXITED info.SetStatus(int32(t.exitStatus.Code)) } // TODO(b/72102453): Set utime, stime. diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 9ba5f8d78..f142feab4 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -536,7 +536,7 @@ func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { // appropriate for /proc/[pid]/status. func (t *Task) StateStatus() string { switch s := t.TaskGoroutineSchedInfo().State; s { - case TaskGoroutineNonexistent: + case TaskGoroutineNonexistent, TaskGoroutineRunningSys: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch t.exitState { @@ -546,16 +546,16 @@ func (t *Task) StateStatus() string { return "X (dead)" default: // The task goroutine can't exit before passing through - // runExitNotify, so this indicates that the task has been created, - // but the task goroutine hasn't yet started. The Linux equivalent - // is struct task_struct::state == TASK_NEW + // runExitNotify, so if s == TaskGoroutineNonexistent, the task has + // been created but the task goroutine hasn't yet started. The + // Linux equivalent is struct task_struct::state == TASK_NEW // (kernel/fork.c:copy_process() => // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is // masked out by TASK_REPORT for /proc/[pid]/status, leaving only // TASK_RUNNING. return "R (running)" } - case TaskGoroutineRunningSys, TaskGoroutineRunningApp: + case TaskGoroutineRunningApp: return "R (running)" case TaskGoroutineBlockedInterruptible: return "S (sleeping)" diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index c2b9fc08f..8ca61ed48 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -86,7 +86,7 @@ var defaultActions = map[linux.Signal]SignalAction{ } // computeAction figures out what to do given a signal number -// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop, +// and an linux.SigAction. SIGSTOP always results in a SignalActionStop, // and SIGKILL always results in a SignalActionTerm. // Signal 0 is always ignored as many programs use it for various internal functions // and don't expect it to do anything. @@ -97,7 +97,7 @@ var defaultActions = map[linux.Signal]SignalAction{ // 0, the default action is taken; // 1, the signal is ignored; // anything else, the function returns SignalActionHandler. -func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { +func computeAction(sig linux.Signal, act linux.SigAction) SignalAction { switch sig { case linux.SIGSTOP: return SignalActionStop @@ -108,9 +108,9 @@ func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction { } switch act.Handler { - case arch.SignalActDefault: + case linux.SIG_DFL: return defaultActions[sig] - case arch.SignalActIgnore: + case linux.SIG_IGN: return SignalActionIgnore default: return SignalActionHandler @@ -127,7 +127,7 @@ var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTI // If there are no pending unmasked signals, dequeueSignalLocked returns nil. // // Preconditions: t.tg.signalHandlers.mu must be locked. -func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo { +func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *linux.SignalInfo { if info := t.pendingSignals.dequeue(mask); info != nil { return info } @@ -155,7 +155,7 @@ func (t *Task) PendingSignals() linux.SignalSet { } // deliverSignal delivers the given signal and returns the following run state. -func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState { +func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState { sigact := computeAction(linux.Signal(info.Signo), act) if t.haveSyscallReturn { @@ -172,7 +172,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS fallthrough case sre == syserror.ERESTART_RESTARTBLOCK: fallthrough - case (sre == syserror.ERESTARTSYS && !act.IsRestart()): + case (sre == syserror.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0): t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo) t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1))) default: @@ -236,7 +236,7 @@ func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunS // deliverSignalToHandler changes the task's userspace state to enter the given // user-configured handler for the given signal. -func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error { +func (t *Task) deliverSignalToHandler(info *linux.SignalInfo, act linux.SigAction) error { // Signal delivery to an application handler interrupts restartable // sequences. t.rseqInterrupt() @@ -248,8 +248,8 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // N.B. This is a *copy* of the alternate stack that the user's signal // handler expects to see in its ucontext (even if it's not in use). alt := t.signalStack - if act.IsOnStack() && alt.IsEnabled() { - alt.SetOnStack() + if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() { + alt.Flags |= linux.SS_ONSTACK if !alt.Contains(sp) { sp = hostarch.Addr(alt.Top()) } @@ -289,7 +289,7 @@ func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) // Add our signal mask. newMask := t.signalMask | act.Mask - if !act.IsNoDefer() { + if act.Flags&linux.SA_NODEFER == 0 { newMask |= linux.SignalSetOf(linux.Signal(info.Signo)) } t.SetSignalMask(newMask) @@ -326,7 +326,7 @@ func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) { // Preconditions: // * The caller must be running on the task goroutine. // * t.exitState < TaskExitZombie. -func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) { +func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.SignalInfo, error) { // set is the set of signals we're interested in; invert it to get the set // of signals to block. mask := ^(set &^ UnblockableSignals) @@ -373,7 +373,7 @@ func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.S // syserror.EINVAL - The signal is not valid. // syserror.EAGAIN - THe signal is realtime, and cannot be queued. // -func (t *Task) SendSignal(info *arch.SignalInfo) error { +func (t *Task) SendSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() @@ -382,7 +382,7 @@ func (t *Task) SendSignal(info *arch.SignalInfo) error { } // SendGroupSignal sends the given signal to t's thread group. -func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { +func (t *Task) SendGroupSignal(info *linux.SignalInfo) error { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() @@ -392,7 +392,7 @@ func (t *Task) SendGroupSignal(info *arch.SignalInfo) error { // SendSignal sends the given signal to tg, using tg's leader to determine if // the signal is blocked. -func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { +func (tg *ThreadGroup) SendSignal(info *linux.SignalInfo) error { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() @@ -400,11 +400,11 @@ func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { return tg.leader.sendSignalLocked(info, true /* group */) } -func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { +func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error { return t.sendSignalTimerLocked(info, group, nil) } -func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error { +func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error { if t.exitState == TaskExitDead { return syserror.ESRCH } @@ -572,9 +572,9 @@ func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { blocked := linux.SignalSetOf(sig)&t.signalMask != 0 act := t.tg.signalHandlers.actions[sig] - ignored := act.Handler == arch.SignalActIgnore + ignored := act.Handler == linux.SIG_IGN if blocked || ignored || unconditional { - act.Handler = arch.SignalActDefault + act.Handler = linux.SIG_DFL t.tg.signalHandlers.actions[sig] = act if blocked { t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) @@ -641,17 +641,17 @@ func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { } // SignalStack returns the task-private signal stack. -func (t *Task) SignalStack() arch.SignalStack { +func (t *Task) SignalStack() linux.SignalStack { t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) alt := t.signalStack if t.onSignalStack(alt) { - alt.Flags |= arch.SignalStackFlagOnStack + alt.Flags |= linux.SS_ONSTACK } return alt } // onSignalStack returns true if the task is executing on the given signal stack. -func (t *Task) onSignalStack(alt arch.SignalStack) bool { +func (t *Task) onSignalStack(alt linux.SignalStack) bool { sp := hostarch.Addr(t.Arch().Stack()) return alt.Contains(sp) } @@ -661,30 +661,30 @@ func (t *Task) onSignalStack(alt arch.SignalStack) bool { // This value may not be changed if the task is currently executing on the // signal stack, i.e. if t.onSignalStack returns true. In this case, this // function will return false. Otherwise, true is returned. -func (t *Task) SetSignalStack(alt arch.SignalStack) bool { +func (t *Task) SetSignalStack(alt linux.SignalStack) bool { // Check that we're not executing on the stack. if t.onSignalStack(t.signalStack) { return false } - if alt.Flags&arch.SignalStackFlagDisable != 0 { + if alt.Flags&linux.SS_DISABLE != 0 { // Don't record anything beyond the flags. - t.signalStack = arch.SignalStack{ - Flags: arch.SignalStackFlagDisable, + t.signalStack = linux.SignalStack{ + Flags: linux.SS_DISABLE, } } else { // Mask out irrelevant parts: only disable matters. - alt.Flags &= arch.SignalStackFlagDisable + alt.Flags &= linux.SS_DISABLE t.signalStack = alt } return true } -// SetSignalAct atomically sets the thread group's signal action for signal sig +// SetSigAction atomically sets the thread group's signal action for signal sig // to *actptr (if actptr is not nil) and returns the old signal action. -func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) { +func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) { if !sig.IsValid() { - return arch.SignalAct{}, syserror.EINVAL + return linux.SigAction{}, syserror.EINVAL } tg.pidns.owner.mu.RLock() @@ -718,48 +718,6 @@ func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (a return oldact, nil } -// CopyOutSignalAct converts the given SignalAct into an architecture-specific -// type and then copies it out to task memory. -func (t *Task) CopyOutSignalAct(addr hostarch.Addr, s *arch.SignalAct) error { - n := t.Arch().NewSignalAct() - n.SerializeFrom(s) - _, err := n.CopyOut(t, addr) - return err -} - -// CopyInSignalAct copies an architecture-specific sigaction type from task -// memory and then converts it into a SignalAct. -func (t *Task) CopyInSignalAct(addr hostarch.Addr) (arch.SignalAct, error) { - n := t.Arch().NewSignalAct() - var s arch.SignalAct - if _, err := n.CopyIn(t, addr); err != nil { - return s, err - } - n.DeserializeTo(&s) - return s, nil -} - -// CopyOutSignalStack converts the given SignalStack into an -// architecture-specific type and then copies it out to task memory. -func (t *Task) CopyOutSignalStack(addr hostarch.Addr, s *arch.SignalStack) error { - n := t.Arch().NewSignalStack() - n.SerializeFrom(s) - _, err := n.CopyOut(t, addr) - return err -} - -// CopyInSignalStack copies an architecture-specific stack_t from task memory -// and then converts it into a SignalStack. -func (t *Task) CopyInSignalStack(addr hostarch.Addr) (arch.SignalStack, error) { - n := t.Arch().NewSignalStack() - var s arch.SignalStack - if _, err := n.CopyIn(t, addr); err != nil { - return s, err - } - n.DeserializeTo(&s) - return s, nil -} - // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) @@ -774,7 +732,7 @@ func (*groupStop) Killable() bool { return true } // previously-dequeued stop signal. // // Preconditions: The caller must be running on the task goroutine. -func (t *Task) initiateGroupStop(info *arch.SignalInfo) { +func (t *Task) initiateGroupStop(info *linux.SignalInfo) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() @@ -909,8 +867,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] - if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) { - sigchld := &arch.SignalInfo{ + if !ok || (act.Handler != linux.SIG_IGN && act.Flags&linux.SA_NOCLDSTOP == 0) { + sigchld := &linux.SignalInfo{ Signo: int32(linux.SIGCHLD), Code: code, } @@ -955,14 +913,14 @@ func (*runInterrupt) execute(t *Task) taskRunState { // notified its tracer accordingly. But it's consistent with // Linux... if intr { - tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + tracer.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) if !notifyParent { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) } else { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) } } else { - tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + tracer.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) tracer.tg.eventQueue.Notify(EventGroupContinue) } } @@ -974,10 +932,10 @@ func (*runInterrupt) execute(t *Task) taskRunState { // SIGCHLD is a standard signal, so the latter would always be // dropped. Hence sending only the former is equivalent. if intr { - t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) } else { - t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig)) + t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) } } @@ -1018,7 +976,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { // without requiring an extra PTRACE_GETSIGINFO call." - // "Group-stop", ptrace(2) t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 - t.ptraceSiginfo = &arch.SignalInfo{ + t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(sig), Code: t.ptraceCode, } @@ -1029,7 +987,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { t.ptraceSiginfo = nil } if t.beginPtraceStopLocked() { - tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) + tracer.signalStop(t, linux.CLD_STOPPED, int32(sig)) // For consistency with Linux, if the parent and tracer are in the // same thread group, deduplicate notification signals. if notifyParent && tracer.tg == t.tg.leader.parent.tg { @@ -1047,7 +1005,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { t.tg.signalHandlers.mu.Unlock() } if notifyParent { - t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig)) + t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } t.tg.pidns.owner.mu.RUnlock() @@ -1101,7 +1059,7 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { if sig != linux.Signal(info.Signo) { info.Signo = int32(sig) info.Errno = 0 - info.Code = arch.SignalInfoUser + info.Code = linux.SI_USER // pid isn't a valid field for all signal numbers, but Linux // doesn't care (kernel/signal.c:ptrace_signal()). // diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 32031cd70..41fd2d471 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" @@ -131,7 +130,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { runState: (*runApp)(nil), interruptChan: make(chan struct{}, 1), signalMask: cfg.SignalMask, - signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, image: *image, fsContext: cfg.FSContext, fdTable: cfg.FDTable, diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index b92e98fa1..891e2201d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -446,10 +445,10 @@ func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error { othertg.signalHandlers.mu.Lock() othertg.tty = nil if othertg.processGroup == tg.processGroup.session.foreground { - if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { + if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil { lastErr = err } - if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { + if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil { lastErr = err } } @@ -490,10 +489,10 @@ func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() - // TODO(b/129283598): "If tcsetpgrp() is called by a member of a - // background process group in its session, and the calling process is - // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all - // members of this background process group." + // TODO(gvisor.dev/issue/6148): "If tcsetpgrp() is called by a member of a + // background process group in its session, and the calling process is not + // blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all members of + // this background process group." // tty must be the controlling terminal. if tg.tty != tty { diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index f61a8e164..26aa34aa6 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -458,25 +458,6 @@ func NewTimer(clock Clock, listener TimerListener) *Timer { return t } -// After waits for the duration to elapse according to clock and then sends a -// notification on the returned channel. The timer is started immediately and -// will fire exactly once. The second return value is the start time used with -// the duration. -// -// Callers must call Timer.Destroy. -func After(clock Clock, duration time.Duration) (*Timer, Time, <-chan struct{}) { - notifier, tchan := NewChannelNotifier() - t := NewTimer(clock, notifier) - now := clock.Now() - - t.Swap(Setting{ - Enabled: true, - Period: 0, - Next: now.Add(duration), - }) - return t, now, tchan -} - // init initializes Timer state that is not preserved across save/restore. If // init has already been called, calling it again is a no-op. // diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 3886b4d33..3e302d92c 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -59,7 +59,7 @@ func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.Fil // Linux silently truncates the remainder of the line if it exceeds // interpMaxLineLength. i := bytes.IndexByte(line, '\n') - if i > 0 { + if i >= 0 { line = line[:i] } diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index b81292c46..d1a883da4 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -1062,10 +1062,20 @@ func (f *MemoryFile) runReclaim() { break } - // If ManualZeroing is in effect, pages will be zeroed on allocation - // and may not be freed by decommitFile, so calling decommitFile is - // unnecessary. - if !f.opts.ManualZeroing { + if f.opts.ManualZeroing { + // If ManualZeroing is in effect, only hugepage-aligned regions may + // be safely passed to decommitFile. Pages will be zeroed on + // reallocation, so we don't need to perform any manual zeroing + // here, whether or not decommitFile succeeds. + if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok { + if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr { + decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)} + if err := f.decommitFile(decommitFR); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err) + } + } + } + } else { if err := f.decommitFile(fr); err != nil { log.Warningf("Reclaim failed to decommit %v: %v", fr, err) // Zero the pages manually. This won't reduce memory usage, but at diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index b307832fd..8a490b3de 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -6,6 +6,8 @@ go_library( name = "kvm", srcs = [ "address_space.go", + "address_space_amd64.go", + "address_space_arm64.go", "bluepill.go", "bluepill_allocator.go", "bluepill_amd64.go", @@ -77,6 +79,7 @@ go_test( "requires-kvm", ], deps = [ + "//pkg/abi/linux", "//pkg/hostarch", "//pkg/ring0", "//pkg/ring0/pagetables", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index 5524e8727..9929caebb 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -85,15 +85,6 @@ type addressSpace struct { dirtySet *dirtySet } -// invalidate is the implementation for Invalidate. -func (as *addressSpace) invalidate() { - as.dirtySet.forEach(as.machine, func(c *vCPU) { - if c.active.get() == as { // If this happens to be active, - c.BounceToKernel() // ... force a kernel transition. - } - }) -} - // Invalidate interrupts all dirty contexts. func (as *addressSpace) Invalidate() { as.mu.Lock() diff --git a/pkg/sentry/platform/kvm/address_space_amd64.go b/pkg/sentry/platform/kvm/address_space_amd64.go new file mode 100644 index 000000000..d11d38679 --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space_amd64.go @@ -0,0 +1,24 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// invalidate is the implementation for Invalidate. +func (as *addressSpace) invalidate() { + as.dirtySet.forEach(as.machine, func(c *vCPU) { + if c.active.get() == as { // If this happens to be active, + c.BounceToKernel() // ... force a kernel transition. + } + }) +} diff --git a/pkg/sentry/platform/kvm/address_space_arm64.go b/pkg/sentry/platform/kvm/address_space_arm64.go new file mode 100644 index 000000000..fb954418b --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "gvisor.dev/gvisor/pkg/ring0" +) + +// invalidate is the implementation for Invalidate. +func (as *addressSpace) invalidate() { + bluepill(as.pageTables.Allocator.(*allocator).cpu) + ring0.FlushTlbAll() +} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index f4d4473a8..183e741ea 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -17,6 +17,7 @@ package kvm import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" pkgcontext "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" @@ -32,15 +33,15 @@ type context struct { // machine is the parent machine, and is immutable. machine *machine - // info is the arch.SignalInfo cached for this context. - info arch.SignalInfo + // info is the linux.SignalInfo cached for this context. + info linux.SignalInfo // interrupt is the interrupt context. interrupt interrupt.Forwarder } // Switch runs the provided context in the given address space. -func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*arch.SignalInfo, hostarch.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, _ int32) (*linux.SignalInfo, hostarch.AccessType, error) { as := mm.AddressSpace() localAS := as.(*addressSpace) diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index b8dd1e4a5..b1cab89a0 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -19,6 +19,7 @@ package kvm import ( "testing" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -30,7 +31,7 @@ func TestSegments(t *testing.T) { applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -55,7 +56,7 @@ func stmxcsr(addr *uint32) func TestMXCSR(t *testing.T) { applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo switchOpts := ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index ceff09a60..fe570aff9 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -22,6 +22,7 @@ import ( "time" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" @@ -157,7 +158,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func func TestApplicationSyscall(t *testing.T) { applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -171,7 +172,7 @@ func TestApplicationSyscall(t *testing.T) { return false }) applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -188,7 +189,7 @@ func TestApplicationSyscall(t *testing.T) { func TestApplicationFault(t *testing.T) { applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -203,7 +204,7 @@ func TestApplicationFault(t *testing.T) { }) applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -221,7 +222,7 @@ func TestRegistersSyscall(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -244,7 +245,7 @@ func TestRegistersFault(t *testing.T) { applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -270,7 +271,7 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -285,7 +286,7 @@ func TestBounce(t *testing.T) { time.Sleep(time.Millisecond) c.BounceToKernel() }() - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -317,7 +318,7 @@ func TestBounceStress(t *testing.T) { c.BounceToKernel() }() randomSleep() - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -338,7 +339,7 @@ func TestInvalidate(t *testing.T) { applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -353,7 +354,7 @@ func TestInvalidate(t *testing.T) { // Unmap the page containing data & invalidate. pt.Unmap(hostarch.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(hostarch.PageSize-1)), hostarch.PageSize) for { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -371,13 +372,13 @@ func TestInvalidate(t *testing.T) { } // IsFault returns true iff the given signal represents a fault. -func IsFault(err error, si *arch.SignalInfo) bool { +func IsFault(err error, si *linux.SignalInfo) bool { return err == platform.ErrContextSignal && si.Signo == int32(unix.SIGSEGV) } func TestEmptyAddressSpace(t *testing.T) { applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -391,7 +392,7 @@ func TestEmptyAddressSpace(t *testing.T) { return false }) applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -467,7 +468,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { a int // Count for ErrContextInterrupt. ) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, @@ -504,7 +505,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { a int ) applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo + var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, FloatingPointState: &dummyFPState, diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 9a2337654..7c063c7f5 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -23,11 +23,11 @@ import ( "runtime/debug" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" @@ -264,10 +264,10 @@ func (c *vCPU) setSystemTime() error { // nonCanonical generates a canonical address return. // //go:nosplit -func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (hostarch.AccessType, error) { - *info = arch.SignalInfo{ +func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { + *info = linux.SignalInfo{ Signo: signal, - Code: arch.SignalInfoKernel, + Code: linux.SI_KERNEL, } info.SetAddr(addr) // Include address. return hostarch.NoAccess, platform.ErrContextSignal @@ -276,7 +276,7 @@ func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (hostarch.Ac // fault generates an appropriate fault return. // //go:nosplit -func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType, error) { +func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { bluepill(c) // Probably no-op, but may not be. faultAddr := ring0.ReadCR2() code, user := c.ErrorCode() @@ -287,7 +287,7 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType, return hostarch.NoAccess, platform.ErrContextInterrupt } // Reset the pointed SignalInfo. - *info = arch.SignalInfo{Signo: signal} + *info = linux.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) accessType := hostarch.AccessType{ Read: code&(1<<1) == 0, @@ -325,7 +325,7 @@ func prefaultFloatingPointState(data *fpu.State) { } // SwitchToUser unpacks architectural-details. -func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (hostarch.AccessType, error) { +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { // Check for canonical addresses. if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { return nonCanonical(regs.Rip, int32(unix.SIGSEGV), info) @@ -371,7 +371,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return c.fault(int32(unix.SIGSEGV), info) case ring0.Debug, ring0.Breakpoint: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGTRAP), Code: 1, // TRAP_BRKPT (breakpoint). } @@ -383,9 +383,9 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) ring0.BoundRangeExceeded, ring0.InvalidTSS, ring0.StackSegmentFault: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGSEGV), - Code: arch.SignalInfoKernel, + Code: linux.SI_KERNEL, } info.SetAddr(switchOpts.Registers.Rip) // Include address. if vector == ring0.GeneralProtectionFault { @@ -397,7 +397,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return hostarch.AccessType{}, platform.ErrContextSignal case ring0.InvalidOpcode: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGILL), Code: 1, // ILL_ILLOPC (illegal opcode). } @@ -405,7 +405,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return hostarch.AccessType{}, platform.ErrContextSignal case ring0.DivideByZero: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 1, // FPE_INTDIV (divide by zero). } @@ -413,7 +413,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return hostarch.AccessType{}, platform.ErrContextSignal case ring0.Overflow: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 2, // FPE_INTOVF (integer overflow). } @@ -422,7 +422,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) case ring0.X87FloatingPointException, ring0.SIMDFloatingPointException: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGFPE), Code: 7, // FPE_FLTINV (invalid operation). } @@ -433,7 +433,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) return hostarch.NoAccess, platform.ErrContextInterrupt case ring0.AlignmentCheck: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGBUS), Code: 2, // BUS_ADRERR (physical address does not exist). } @@ -469,7 +469,7 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { } func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { - // Map all the executible regions so that all the entry functions + // Map all the executable regions so that all the entry functions // are mapped in the upper half. applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 8926b1d9f..edaccf9bc 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -21,10 +21,10 @@ import ( "sync/atomic" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ) @@ -126,10 +126,10 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) { // nonCanonical generates a canonical address return. // //go:nosplit -func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (hostarch.AccessType, error) { - *info = arch.SignalInfo{ +func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { + *info = linux.SignalInfo{ Signo: signal, - Code: arch.SignalInfoKernel, + Code: linux.SI_KERNEL, } info.SetAddr(addr) // Include address. return hostarch.NoAccess, platform.ErrContextSignal @@ -157,7 +157,7 @@ func isWriteFault(code uint64) bool { // fault generates an appropriate fault return. // //go:nosplit -func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType, error) { +func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) { bluepill(c) // Probably no-op, but may not be. faultAddr := c.GetFaultAddr() code, user := c.ErrorCode() @@ -170,7 +170,7 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (hostarch.AccessType, } // Reset the pointed SignalInfo. - *info = arch.SignalInfo{Signo: signal} + *info = linux.SignalInfo{Signo: signal} info.SetAddr(uint64(faultAddr)) ret := code & _ESR_ELx_FSC diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 92edc992b..1b0a6e0a7 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -23,10 +23,10 @@ import ( "unsafe" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/ring0" "gvisor.dev/gvisor/pkg/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/platform" ktime "gvisor.dev/gvisor/pkg/sentry/time" @@ -272,7 +272,7 @@ func (c *vCPU) getOneRegister(reg *kvmOneReg) error { } // SwitchToUser unpacks architectural-details. -func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (hostarch.AccessType, error) { +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { // Check for canonical addresses. if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) { return nonCanonical(regs.Pc, int32(unix.SIGSEGV), info) @@ -319,14 +319,14 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) case ring0.El0SyncUndef: return c.fault(int32(unix.SIGILL), info) case ring0.El0SyncDbg: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGTRAP), Code: 1, // TRAP_BRKPT (breakpoint). } info.SetAddr(switchOpts.Registers.Pc) // Include address. return hostarch.AccessType{}, platform.ErrContextSignal case ring0.El0SyncSpPc: - *info = arch.SignalInfo{ + *info = linux.SignalInfo{ Signo: int32(unix.SIGBUS), Code: 2, // BUS_ADRERR (physical address does not exist). } diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index ef7814a6f..a26bc2316 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -195,8 +195,8 @@ type Context interface { // - nil: The Context invoked a system call. // // - ErrContextSignal: The Context was interrupted by a signal. The - // returned *arch.SignalInfo contains information about the signal. If - // arch.SignalInfo.Signo == SIGSEGV, the returned hostarch.AccessType + // returned *linux.SignalInfo contains information about the signal. If + // linux.SignalInfo.Signo == SIGSEGV, the returned hostarch.AccessType // contains the access type of the triggering fault. The caller owns // the returned SignalInfo. // @@ -207,7 +207,7 @@ type Context interface { // concurrent call to Switch(). // // - ErrContextCPUPreempted: See the definition of that error for details. - Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, hostarch.AccessType, error) + Switch(ctx context.Context, mm MemoryManager, ac arch.Context, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) // PullFullState() pulls a full state of the application thread. // diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 828458ce2..319b0cf1d 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -73,7 +73,7 @@ var ( type context struct { // signalInfo is the signal info, if and when a signal is received. - signalInfo arch.SignalInfo + signalInfo linux.SignalInfo // interrupt is the interrupt context. interrupt interrupt.Forwarder @@ -96,7 +96,7 @@ type context struct { } // Switch runs the provided context in the given address space. -func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*arch.SignalInfo, hostarch.AccessType, error) { +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac arch.Context, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) { as := mm.AddressSpace() s := as.(*subprocess) isSyscall := s.switchToApp(c, ac) diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index facb96011..cc93396a9 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -101,7 +101,7 @@ func (t *thread) setFPRegs(fpState *fpu.State, fpLen uint64, useXsave bool) erro } // getSignalInfo retrieves information about the signal that caused the stop. -func (t *thread) getSignalInfo(si *arch.SignalInfo) error { +func (t *thread) getSignalInfo(si *linux.SignalInfo) error { _, _, errno := unix.RawSyscall6( unix.SYS_PTRACE, unix.PTRACE_GETSIGINFO, diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 9c73a725a..0931795c5 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -20,6 +20,7 @@ import ( "runtime" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" @@ -524,7 +525,7 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool { // Check for interrupts, and ensure that future interrupts will signal t. if !c.interrupt.Enable(t) { // Pending interrupt; simulate. - c.signalInfo = arch.SignalInfo{Signo: int32(platform.SignalInterrupt)} + c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)} return false } defer c.interrupt.Disable() diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go index 9252c0bd7..90b1ead56 100644 --- a/pkg/sentry/platform/ptrace/subprocess_amd64.go +++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go @@ -155,7 +155,7 @@ func initChildProcessPPID(initregs *arch.Registers, ppid int32) { // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. -func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { +func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go index c0cbc0686..e4257e3bf 100644 --- a/pkg/sentry/platform/ptrace/subprocess_arm64.go +++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go @@ -138,7 +138,7 @@ func initChildProcessPPID(initregs *arch.Registers, ppid int32) { // // Note that this should only be called after verifying that the signalInfo has // been generated by the kernel. -func patchSignalInfo(regs *arch.Registers, signalInfo *arch.SignalInfo) { +func patchSignalInfo(regs *arch.Registers, signalInfo *linux.SignalInfo) { if linux.Signal(signalInfo.Signo) == linux.SIGSYS { signalInfo.Signo = int32(linux.SIGSEGV) diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go index d6a2fbe34..3fe5c6770 100644 --- a/pkg/sentry/sighandling/sighandling_unsafe.go +++ b/pkg/sentry/sighandling/sighandling_unsafe.go @@ -21,25 +21,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) -// FIXME(gvisor.dev/issue/214): Move to pkg/abi/linux along with definitions in -// pkg/sentry/arch. -type sigaction struct { - handler uintptr - flags uint64 - restorer uintptr - mask uint64 -} - // IgnoreChildStop sets the SA_NOCLDSTOP flag, causing child processes to not // generate SIGCHLD when they stop. func IgnoreChildStop() error { - var sa sigaction + var sa linux.SigAction // Get the existing signal handler information, and set the flag. if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), 0, uintptr(unsafe.Pointer(&sa)), linux.SignalSetSize, 0, 0); e != 0 { return e } - sa.flags |= linux.SA_NOCLDSTOP + sa.Flags |= linux.SA_NOCLDSTOP if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(unix.SIGCHLD), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 { return e } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 2e3064565..3c6511ead 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -39,8 +39,6 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", - "//pkg/tcpip/network/ipv4", - "//pkg/tcpip/network/ipv6", "//pkg/tcpip/stack", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 393a1ab3a..cbb1e905d 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -35,8 +35,6 @@ import ( "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" - "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/usermem" ) @@ -66,8 +64,6 @@ type Stack struct { tcpSACKEnabled bool netDevFile *os.File netSNMPFile *os.File - ipv4Forwarding bool - ipv6Forwarding bool } // NewStack returns an empty Stack containing no configuration. @@ -127,13 +123,6 @@ func (s *Stack) Configure() error { s.netSNMPFile = f } - s.ipv6Forwarding = false - if ipForwarding, err := ioutil.ReadFile("/proc/sys/net/ipv6/conf/all/forwarding"); err == nil { - s.ipv6Forwarding = strings.TrimSpace(string(ipForwarding)) != "0" - } else { - log.Warningf("Failed to read if ipv6 forwarding is enabled, setting to false") - } - return nil } @@ -492,19 +481,6 @@ func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return nil } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *Stack) RestoreCleanupEndpoints([]stack.TransportEndpoint) {} -// Forwarding implements inet.Stack.Forwarding. -func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { - switch protocol { - case ipv4.ProtocolNumber: - return s.ipv4Forwarding - case ipv6.ProtocolNumber: - return s.ipv6Forwarding - default: - log.Warningf("Forwarding(%v) failed: unsupported protocol", protocol) - return false - } -} - // SetForwarding implements inet.Stack.SetForwarding. func (s *Stack) SetForwarding(tcpip.NetworkProtocolNumber, bool) error { return syserror.EACCES diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index 61b2c9755..608474fa1 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/log", "//pkg/marshal", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go index 6fc7781ad..3f1b4a17b 100644 --- a/pkg/sentry/socket/netfilter/extensions.go +++ b/pkg/sentry/socket/netfilter/extensions.go @@ -19,20 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) -// TODO(gvisor.dev/issue/170): The following per-matcher params should be -// supported: -// - Table name -// - Match size -// - User size -// - Hooks -// - Proto -// - Family - // matchMaker knows how to (un)marshal the matcher named name(). type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. @@ -43,7 +35,7 @@ type matchMaker interface { // unmarshal converts from the ABI matcher struct to an // stack.Matcher. - unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) + unmarshal(task *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) } type matcher interface { @@ -94,12 +86,12 @@ func marshalEntryMatch(name string, data []byte) []byte { return buf } -func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { +func unmarshalMatcher(task *kernel.Task, match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { matchMaker, ok := matchMakers[match.Name.String()] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) } - return matchMaker.unmarshal(buf, filter) + return matchMaker.unmarshal(task, buf, filter) } // targetMaker knows how to (un)marshal a target. Once registered, diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go index cb78ef60b..d8bd86292 100644 --- a/pkg/sentry/socket/netfilter/ipv4.go +++ b/pkg/sentry/socket/netfilter/ipv4.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -123,7 +124,7 @@ func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTG return entries, info } -func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { +func modifyEntries4(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. @@ -148,23 +149,19 @@ func modifyEntries4(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. filter, err := filterFromIPTIP(entry.IP) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) + matchers, err := parseMatchers(task, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go index 5cb7fe4aa..c68230847 100644 --- a/pkg/sentry/socket/netfilter/ipv6.go +++ b/pkg/sentry/socket/netfilter/ipv6.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -126,7 +127,7 @@ func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6T return entries, info } -func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { +func modifyEntries6(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. @@ -151,23 +152,19 @@ func modifyEntries6(stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): We should support more IPTIP - // filtering fields. filter, err := filterFromIP6TIP(entry.IPv6) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Matchers and targets can specify - // that they only work for certain protocols, hooks, tables. // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } - matchers, err := parseMatchers(filter, optVal[:matchersSize]) + matchers, err := parseMatchers(task, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index f42d73178..e3eade180 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -58,8 +58,8 @@ var nameToID = map[string]stack.TableID{ // DefaultLinuxTables returns the rules of stack.DefaultTables() wrapped for // compatibility with netfilter extensions. -func DefaultLinuxTables() *stack.IPTables { - tables := stack.DefaultTables() +func DefaultLinuxTables(seed uint32) *stack.IPTables { + tables := stack.DefaultTables(seed) tables.VisitTargets(func(oldTarget stack.Target) stack.Target { switch val := oldTarget.(type) { case *stack.AcceptTarget: @@ -174,13 +174,12 @@ func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. -func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { +func SetEntries(task *kernel.Task, stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace replaceBuf := optVal[:linux.SizeOfIPTReplace] optVal = optVal[linux.SizeOfIPTReplace:] replace.UnmarshalBytes(replaceBuf) - // TODO(gvisor.dev/issue/170): Support other tables. var table stack.Table switch replace.Name.String() { case filterTable: @@ -188,16 +187,16 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { case natTable: table = stack.EmptyNATTable() default: - nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String()) + nflog("unknown iptables table %q", replace.Name.String()) return syserr.ErrInvalidArgument } var err *syserr.Error var offsets map[uint32]int if ipv6 { - offsets, err = modifyEntries6(stk, optVal, &replace, &table) + offsets, err = modifyEntries6(task, stk, optVal, &replace, &table) } else { - offsets, err = modifyEntries4(stk, optVal, &replace, &table) + offsets, err = modifyEntries4(task, stk, optVal, &replace, &table) } if err != nil { return err @@ -272,7 +271,6 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { table.Rules[ruleIdx] = rule } - // TODO(gvisor.dev/issue/170): Support other chains. // Since we don't support FORWARD, yet, make sure all other chains point to // ACCEPT rules. for hook, ruleIdx := range table.BuiltinChains { @@ -287,7 +285,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { } } - // TODO(gvisor.dev/issue/170): Check the following conditions: + // TODO(gvisor.dev/issue/6167): Check the following conditions: // - There are no loops. // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. @@ -297,7 +295,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. -func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { +func parseMatchers(task *kernel.Task, filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { nflog("set entries: parsing matchers of size %d", len(optVal)) var matchers []stack.Matcher for len(optVal) > 0 { @@ -321,13 +319,13 @@ func parseMatchers(filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, } // Parse the specific matcher. - matcher, err := unmarshalMatcher(match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) + matcher, err := unmarshalMatcher(task, match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) if err != nil { return nil, fmt.Errorf("failed to create matcher: %v", err) } matchers = append(matchers, matcher) - // TODO(gvisor.dev/issue/170): Check the revision field. + // TODO(gvisor.dev/issue/6167): Check the revision field. optVal = optVal[match.MatchSize:] } diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go index 60845cab3..6eff2ae65 100644 --- a/pkg/sentry/socket/netfilter/owner_matcher.go +++ b/pkg/sentry/socket/netfilter/owner_matcher.go @@ -19,6 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -40,8 +42,8 @@ func (ownerMarshaler) name() string { func (ownerMarshaler) marshal(mr matcher) []byte { matcher := mr.(*OwnerMatcher) iptOwnerInfo := linux.IPTOwnerInfo{ - UID: matcher.uid, - GID: matcher.gid, + UID: uint32(matcher.uid), + GID: uint32(matcher.gid), } // Support for UID and GID match. @@ -63,7 +65,7 @@ func (ownerMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (ownerMarshaler) unmarshal(task *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfIPTOwnerInfo { return nil, fmt.Errorf("buf has insufficient size for owner match: %d", len(buf)) } @@ -72,11 +74,12 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack. // exceed what's strictly necessary to hold matchData. var matchData linux.IPTOwnerInfo matchData.UnmarshalUnsafe(buf[:linux.SizeOfIPTOwnerInfo]) - nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData) + nflog("parsed IPTOwnerInfo: %+v", matchData) var owner OwnerMatcher - owner.uid = matchData.UID - owner.gid = matchData.GID + creds := task.Credentials() + owner.uid = creds.UserNamespace.MapToKUID(auth.UID(matchData.UID)) + owner.gid = creds.UserNamespace.MapToKGID(auth.GID(matchData.GID)) // Check flags. if matchData.Match&linux.XT_OWNER_UID != 0 { @@ -97,8 +100,8 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack. // OwnerMatcher matches against a UID and/or GID. type OwnerMatcher struct { - uid uint32 - gid uint32 + uid auth.KUID + gid auth.KGID matchUID bool matchGID bool invertUID bool @@ -113,7 +116,6 @@ func (*OwnerMatcher) name() string { // Match implements Matcher.Match. func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { // Support only for OUTPUT chain. - // TODO(gvisor.dev/issue/170): Need to support for POSTROUTING chain also. if hook != stack.Output { return false, true } @@ -126,7 +128,7 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ str var matches bool // Check for UID match. if om.matchUID { - if pkt.Owner.UID() == om.uid { + if auth.KUID(pkt.Owner.KUID()) == om.uid { matches = true } if matches == om.invertUID { @@ -137,7 +139,7 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ str // Check for GID match. if om.matchGID { matches = false - if pkt.Owner.GID() == om.gid { + if auth.KGID(pkt.Owner.KGID()) == om.gid { matches = true } if matches == om.invertGID { diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go index fa5456eee..7d83e708f 100644 --- a/pkg/sentry/socket/netfilter/targets.go +++ b/pkg/sentry/socket/netfilter/targets.go @@ -331,7 +331,6 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) ( return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Check if the flags are valid. // Also check if we need to map ports or IP. // For now, redirect target only supports destination port change. // Port range and IP range are not supported yet. @@ -340,7 +339,6 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) ( return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Port range is not supported yet. if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument @@ -502,7 +500,6 @@ func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta return nil, syserr.ErrInvalidArgument } - // TODO(gvisor.dev/issue/170): Port range is not supported yet. if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("snatTargetMakerV4: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument @@ -594,7 +591,6 @@ func (*snatTargetMakerV6) unmarshal(buf []byte, filter stack.IPHeaderFilter) (ta // translateToStandardTarget translates from the value in a // linux.XTStandardTarget to an stack.Verdict. func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (target, *syserr.Error) { - // TODO(gvisor.dev/issue/170): Support other verdicts. switch val { case -linux.NF_ACCEPT - 1: return &acceptTarget{stack.AcceptTarget{ diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go index 95bb9826e..e5b73a976 100644 --- a/pkg/sentry/socket/netfilter/tcp_matcher.go +++ b/pkg/sentry/socket/netfilter/tcp_matcher.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -50,7 +51,7 @@ func (tcpMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (tcpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (tcpMarshaler) unmarshal(_ *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTTCP { return nil, fmt.Errorf("buf has insufficient size for TCP match: %d", len(buf)) } @@ -95,8 +96,6 @@ func (*TCPMatcher) name() string { // Match implements Matcher.Match. func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().View()) diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go index fb8be27e6..aa72ee70c 100644 --- a/pkg/sentry/socket/netfilter/udp_matcher.go +++ b/pkg/sentry/socket/netfilter/udp_matcher.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/marshal" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -50,7 +51,7 @@ func (udpMarshaler) marshal(mr matcher) []byte { } // unmarshal implements matchMaker.unmarshal. -func (udpMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { +func (udpMarshaler) unmarshal(_ *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) { if len(buf) < linux.SizeOfXTUDP { return nil, fmt.Errorf("buf has insufficient size for UDP match: %d", len(buf)) } @@ -92,8 +93,6 @@ func (*UDPMatcher) name() string { // Match implements Matcher.Match. func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, _, _ string) (bool, bool) { - // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved - // into the stack.Check codepath as matchers are added. switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: netHeader := header.IPv4(pkt.NetworkHeader().View()) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 0b64a24c3..11ba80497 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -77,41 +77,59 @@ func mustCreateGauge(name, description string) *tcpip.StatCounter { // Metrics contains metrics exported by netstack. var Metrics = tcpip.Stats{ - UnknownProtocolRcvdPackets: mustCreateMetric("/netstack/unknown_protocol_received_packets", "Number of packets received by netstack that were for an unknown or unsupported protocol."), - MalformedRcvdPackets: mustCreateMetric("/netstack/malformed_received_packets", "Number of packets received by netstack that were deemed malformed."), - DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped by netstack due to full queues."), + DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), + NICs: tcpip.NICStats{ + UnknownL3ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l3_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L3 protocol."), + UnknownL4ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l4_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L4 protocol."), + MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), + Tx: tcpip.NICPacketStats{ + Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), + Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), + }, + Rx: tcpip.NICPacketStats{ + Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), + Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), + }, + DisabledRx: tcpip.NICPacketStats{ + Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), + Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), + }, + Neighbor: tcpip.NICNeighborStats{ + UnreachableEntryLookups: mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), + }, + }, ICMP: tcpip.ICMPStats{ V4: tcpip.ICMPv4Stats{ PacketsSent: tcpip.ICMPv4SentPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent by netstack."), - SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent by netstack."), - Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent by netstack."), - Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent by netstack."), - TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent by netstack."), - InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent by netstack."), - InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent by netstack."), + EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), }, - Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped by netstack due to link layer errors."), - RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped by netstack due to rate limit being exceeded."), + Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), + RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received by netstack."), - SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received by netstack."), - Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received by netstack."), - Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received by netstack."), - TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received by netstack."), - InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received by netstack."), - InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received by netstack."), + EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."), + EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), + SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), + Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), + ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), + Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), + TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), + InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), + InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), }, Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), }, @@ -119,40 +137,40 @@ var Metrics = tcpip.Stats{ V6: tcpip.ICMPv6Stats{ PacketsSent: tcpip.ICMPv6SentPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent by netstack."), - PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent by netstack."), - RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent by netstack."), - RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent by netstack."), - NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent by netstack."), - NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent by netstack."), - RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent by netstack."), - MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent by netstack."), - MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent by netstack."), - MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent by netstack."), + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), + MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), + MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), + MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), }, - Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped by netstack due to link layer errors."), - RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped by netstack due to rate limit being exceeded."), + Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), + RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ - EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received by netstack."), - EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received by netstack."), - DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received by netstack."), - PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received by netstack."), - TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received by netstack."), - ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received by netstack."), - RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received by netstack."), - RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received by netstack."), - NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received by netstack."), - NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received by netstack."), - RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received by netstack."), - MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received by netstack."), - MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent by netstack."), - MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent by netstack."), + EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), + EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), + DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), + PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), + TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), + ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), + RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), + RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), + NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), + NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), + RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), + MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), + MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), + MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), }, Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), @@ -163,23 +181,23 @@ var Metrics = tcpip.Stats{ IGMP: tcpip.IGMPStats{ PacketsSent: tcpip.IGMPSentPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ - MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent by netstack."), - V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent by netstack."), - V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent by netstack."), - LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent by netstack."), + MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), + V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."), + V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."), + LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."), }, - Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped by netstack due to link layer errors."), + Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), }, PacketsReceived: tcpip.IGMPReceivedPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ - MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received by netstack."), - V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received by netstack."), - V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received by netstack."), - LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received by netstack."), + MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), + V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), + V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), + LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), }, - Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received by netstack that could not be parsed."), + Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), - Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received by netstack."), + Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), }, }, IP: tcpip.IPStats{ @@ -205,7 +223,8 @@ var Metrics = tcpip.Stats{ LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), - PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not fit within the outgoing MTU."), + PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), + HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."), Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), }, }, @@ -1126,7 +1145,14 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. - info := linux.TCPInfo{} + info := linux.TCPInfo{ + State: uint8(v.State), + RTO: uint32(v.RTO / time.Microsecond), + RTT: uint32(v.RTT / time.Microsecond), + RTTVar: uint32(v.RTTVar / time.Microsecond), + SndSsthresh: v.SndSsthresh, + SndCwnd: v.SndCwnd, + } switch v.CcState { case tcpip.RTORecovery: info.CaState = linux.TCP_CA_Loss @@ -1137,11 +1163,6 @@ func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, case tcpip.Open: info.CaState = linux.TCP_CA_Open } - info.RTO = uint32(v.RTO / time.Microsecond) - info.RTT = uint32(v.RTT / time.Microsecond) - info.RTTVar = uint32(v.RTTVar / time.Microsecond) - info.SndSsthresh = v.SndSsthresh - info.SndCwnd = v.SndCwnd // In netstack reorderSeen is updated only when RACK is enabled. // We only track whether the reordering is seen, which is @@ -1777,11 +1798,6 @@ func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, nam } v := hostarch.ByteOrder.Uint32(optVal) - - if v == 0 { - socket.SetSockOptEmitUnimplementedEvent(t, name) - } - ep.SocketOptions().SetOutOfBandInline(v != 0) return nil @@ -2089,10 +2105,10 @@ func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal, true) + return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. + log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") return nil default: @@ -2332,10 +2348,10 @@ func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name in return syserr.ErrNoDevice } // Stack must be a netstack stack. - return netfilter.SetEntries(stack.(*Stack).Stack, optVal, false) + return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: - // TODO(gvisor.dev/issue/170): Counter support. + log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") return nil case linux.IP_ADD_SOURCE_MEMBERSHIP, diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 9cc1c57d7..eef5e6519 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -458,16 +458,6 @@ func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { s.Stack.RestoreCleanupEndpoints(es) } -// Forwarding implements inet.Stack.Forwarding. -func (s *Stack) Forwarding(protocol tcpip.NetworkProtocolNumber) bool { - switch protocol { - case ipv4.ProtocolNumber, ipv6.ProtocolNumber: - return s.Stack.Forwarding(protocol) - default: - panic(fmt.Sprintf("Forwarding(%v) failed: unsupported protocol", protocol)) - } -} - // SetForwarding implements inet.Stack.SetForwarding. func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { if err := s.Stack.SetForwardingDefaultAndAllNICs(protocol, enable); err != nil { diff --git a/pkg/sentry/socket/netstack/tun.go b/pkg/sentry/socket/netstack/tun.go index 288dd0c9e..c7ed52702 100644 --- a/pkg/sentry/socket/netstack/tun.go +++ b/pkg/sentry/socket/netstack/tun.go @@ -40,7 +40,7 @@ func LinuxToTUNFlags(flags uint16) (tun.Flags, error) { // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) // when there is no sk_filter. See __tun_chr_ioctl() in // net/drivers/tun.c. - if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI) != 0 { + if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI|linux.IFF_ONE_QUEUE) != 0 { return tun.Flags{}, syserror.EINVAL } return tun.Flags{ diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index 1fbbd133c..369541c7a 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -11,6 +11,7 @@ go_library( "futex.go", "linux64_amd64.go", "linux64_arm64.go", + "mmap.go", "open.go", "poll.go", "ptrace.go", @@ -35,7 +36,6 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/netlink", "//pkg/sentry/syscalls/linux", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/strace/clone.go b/pkg/sentry/strace/clone.go index ab1060426..bfb4d7f5c 100644 --- a/pkg/sentry/strace/clone.go +++ b/pkg/sentry/strace/clone.go @@ -15,98 +15,98 @@ package strace import ( - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" ) // CloneFlagSet is the set of clone(2) flags. var CloneFlagSet = abi.FlagSet{ { - Flag: unix.CLONE_VM, + Flag: linux.CLONE_VM, Name: "CLONE_VM", }, { - Flag: unix.CLONE_FS, + Flag: linux.CLONE_FS, Name: "CLONE_FS", }, { - Flag: unix.CLONE_FILES, + Flag: linux.CLONE_FILES, Name: "CLONE_FILES", }, { - Flag: unix.CLONE_SIGHAND, + Flag: linux.CLONE_SIGHAND, Name: "CLONE_SIGHAND", }, { - Flag: unix.CLONE_PTRACE, + Flag: linux.CLONE_PTRACE, Name: "CLONE_PTRACE", }, { - Flag: unix.CLONE_VFORK, + Flag: linux.CLONE_VFORK, Name: "CLONE_VFORK", }, { - Flag: unix.CLONE_PARENT, + Flag: linux.CLONE_PARENT, Name: "CLONE_PARENT", }, { - Flag: unix.CLONE_THREAD, + Flag: linux.CLONE_THREAD, Name: "CLONE_THREAD", }, { - Flag: unix.CLONE_NEWNS, + Flag: linux.CLONE_NEWNS, Name: "CLONE_NEWNS", }, { - Flag: unix.CLONE_SYSVSEM, + Flag: linux.CLONE_SYSVSEM, Name: "CLONE_SYSVSEM", }, { - Flag: unix.CLONE_SETTLS, + Flag: linux.CLONE_SETTLS, Name: "CLONE_SETTLS", }, { - Flag: unix.CLONE_PARENT_SETTID, + Flag: linux.CLONE_PARENT_SETTID, Name: "CLONE_PARENT_SETTID", }, { - Flag: unix.CLONE_CHILD_CLEARTID, + Flag: linux.CLONE_CHILD_CLEARTID, Name: "CLONE_CHILD_CLEARTID", }, { - Flag: unix.CLONE_DETACHED, + Flag: linux.CLONE_DETACHED, Name: "CLONE_DETACHED", }, { - Flag: unix.CLONE_UNTRACED, + Flag: linux.CLONE_UNTRACED, Name: "CLONE_UNTRACED", }, { - Flag: unix.CLONE_CHILD_SETTID, + Flag: linux.CLONE_CHILD_SETTID, Name: "CLONE_CHILD_SETTID", }, { - Flag: unix.CLONE_NEWUTS, + Flag: linux.CLONE_NEWUTS, Name: "CLONE_NEWUTS", }, { - Flag: unix.CLONE_NEWIPC, + Flag: linux.CLONE_NEWIPC, Name: "CLONE_NEWIPC", }, { - Flag: unix.CLONE_NEWUSER, + Flag: linux.CLONE_NEWUSER, Name: "CLONE_NEWUSER", }, { - Flag: unix.CLONE_NEWPID, + Flag: linux.CLONE_NEWPID, Name: "CLONE_NEWPID", }, { - Flag: unix.CLONE_NEWNET, + Flag: linux.CLONE_NEWNET, Name: "CLONE_NEWNET", }, { - Flag: unix.CLONE_IO, + Flag: linux.CLONE_IO, Name: "CLONE_IO", }, } diff --git a/pkg/sentry/strace/linux64_amd64.go b/pkg/sentry/strace/linux64_amd64.go index d66befe81..6ce1bb592 100644 --- a/pkg/sentry/strace/linux64_amd64.go +++ b/pkg/sentry/strace/linux64_amd64.go @@ -33,7 +33,7 @@ var linuxAMD64 = SyscallMap{ 6: makeSyscallInfo("lstat", Path, Stat), 7: makeSyscallInfo("poll", PollFDs, Hex, Hex), 8: makeSyscallInfo("lseek", Hex, Hex, Hex), - 9: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex), + 9: makeSyscallInfo("mmap", Hex, Hex, MmapProt, MmapFlags, FD, Hex), 10: makeSyscallInfo("mprotect", Hex, Hex, Hex), 11: makeSyscallInfo("munmap", Hex, Hex), 12: makeSyscallInfo("brk", Hex), diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go index 1a2d7d75f..ce5594301 100644 --- a/pkg/sentry/strace/linux64_arm64.go +++ b/pkg/sentry/strace/linux64_arm64.go @@ -246,7 +246,7 @@ var linuxARM64 = SyscallMap{ 219: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex), 220: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex), 221: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector), - 222: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex), + 222: makeSyscallInfo("mmap", Hex, Hex, MmapProt, MmapFlags, FD, Hex), 223: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex), 224: makeSyscallInfo("swapon", Hex, Hex), 225: makeSyscallInfo("swapoff", Hex), diff --git a/pkg/sentry/strace/mmap.go b/pkg/sentry/strace/mmap.go new file mode 100644 index 000000000..0035be586 --- /dev/null +++ b/pkg/sentry/strace/mmap.go @@ -0,0 +1,92 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strace + +import ( + "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// ProtectionFlagSet represents the protection to mmap(2). +var ProtectionFlagSet = abi.FlagSet{ + { + Flag: linux.PROT_READ, + Name: "PROT_READ", + }, + { + Flag: linux.PROT_WRITE, + Name: "PROT_WRITE", + }, + { + Flag: linux.PROT_EXEC, + Name: "PROT_EXEC", + }, +} + +// MmapFlagSet is the set of mmap(2) flags. +var MmapFlagSet = abi.FlagSet{ + { + Flag: linux.MAP_SHARED, + Name: "MAP_SHARED", + }, + { + Flag: linux.MAP_PRIVATE, + Name: "MAP_PRIVATE", + }, + { + Flag: linux.MAP_FIXED, + Name: "MAP_FIXED", + }, + { + Flag: linux.MAP_ANONYMOUS, + Name: "MAP_ANONYMOUS", + }, + { + Flag: linux.MAP_GROWSDOWN, + Name: "MAP_GROWSDOWN", + }, + { + Flag: linux.MAP_DENYWRITE, + Name: "MAP_DENYWRITE", + }, + { + Flag: linux.MAP_EXECUTABLE, + Name: "MAP_EXECUTABLE", + }, + { + Flag: linux.MAP_LOCKED, + Name: "MAP_LOCKED", + }, + { + Flag: linux.MAP_NORESERVE, + Name: "MAP_NORESERVE", + }, + { + Flag: linux.MAP_POPULATE, + Name: "MAP_POPULATE", + }, + { + Flag: linux.MAP_NONBLOCK, + Name: "MAP_NONBLOCK", + }, + { + Flag: linux.MAP_STACK, + Name: "MAP_STACK", + }, + { + Flag: linux.MAP_HUGETLB, + Name: "MAP_HUGETLB", + }, +} diff --git a/pkg/sentry/strace/open.go b/pkg/sentry/strace/open.go index 5769360da..e7c7649f4 100644 --- a/pkg/sentry/strace/open.go +++ b/pkg/sentry/strace/open.go @@ -15,61 +15,61 @@ package strace import ( - "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/abi/linux" ) // OpenMode represents the mode to open(2) a file. var OpenMode = abi.ValueSet{ - unix.O_RDWR: "O_RDWR", - unix.O_WRONLY: "O_WRONLY", - unix.O_RDONLY: "O_RDONLY", + linux.O_RDWR: "O_RDWR", + linux.O_WRONLY: "O_WRONLY", + linux.O_RDONLY: "O_RDONLY", } // OpenFlagSet is the set of open(2) flags. var OpenFlagSet = abi.FlagSet{ { - Flag: unix.O_APPEND, + Flag: linux.O_APPEND, Name: "O_APPEND", }, { - Flag: unix.O_ASYNC, + Flag: linux.O_ASYNC, Name: "O_ASYNC", }, { - Flag: unix.O_CLOEXEC, + Flag: linux.O_CLOEXEC, Name: "O_CLOEXEC", }, { - Flag: unix.O_CREAT, + Flag: linux.O_CREAT, Name: "O_CREAT", }, { - Flag: unix.O_DIRECT, + Flag: linux.O_DIRECT, Name: "O_DIRECT", }, { - Flag: unix.O_DIRECTORY, + Flag: linux.O_DIRECTORY, Name: "O_DIRECTORY", }, { - Flag: unix.O_EXCL, + Flag: linux.O_EXCL, Name: "O_EXCL", }, { - Flag: unix.O_NOATIME, + Flag: linux.O_NOATIME, Name: "O_NOATIME", }, { - Flag: unix.O_NOCTTY, + Flag: linux.O_NOCTTY, Name: "O_NOCTTY", }, { - Flag: unix.O_NOFOLLOW, + Flag: linux.O_NOFOLLOW, Name: "O_NOFOLLOW", }, { - Flag: unix.O_NONBLOCK, + Flag: linux.O_NONBLOCK, Name: "O_NONBLOCK", }, { @@ -77,18 +77,22 @@ var OpenFlagSet = abi.FlagSet{ Name: "O_PATH", }, { - Flag: unix.O_SYNC, + Flag: linux.O_SYNC, Name: "O_SYNC", }, { - Flag: unix.O_TRUNC, + Flag: linux.O_TMPFILE, + Name: "O_TMPFILE", + }, + { + Flag: linux.O_TRUNC, Name: "O_TRUNC", }, } func open(val uint64) string { - s := OpenMode.Parse(val & unix.O_ACCMODE) - if flags := OpenFlagSet.Parse(val &^ unix.O_ACCMODE); flags != "" { + s := OpenMode.Parse(val & linux.O_ACCMODE) + if flags := OpenFlagSet.Parse(val &^ linux.O_ACCMODE); flags != "" { s += "|" + flags } return s diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index e5b379a20..5afc9525b 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -130,8 +130,8 @@ func sigAction(t *kernel.Task, addr hostarch.Addr) string { return "null" } - sa, err := t.CopyInSignalAct(addr) - if err != nil { + var sa linux.SigAction + if _, err := sa.CopyIn(t, addr); err != nil { return fmt.Sprintf("%#x (error copying sigaction: %v)", addr, err) } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index ec5d5f846..af7088847 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -489,6 +489,10 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo output = append(output, epollEvents(t, args[arg].Pointer(), 0 /* numEvents */, uint64(maximumBlobSize))) case SelectFDSet: output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer())) + case MmapProt: + output = append(output, ProtectionFlagSet.Parse(uint64(args[arg].Uint()))) + case MmapFlags: + output = append(output, MmapFlagSet.Parse(uint64(args[arg].Uint()))) case Oct: output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8)) case Hex: diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go index 7e69b9279..5893443a7 100644 --- a/pkg/sentry/strace/syscalls.go +++ b/pkg/sentry/strace/syscalls.go @@ -238,6 +238,12 @@ const ( // EpollEvents is an array of struct epoll_event. It is the events // argument in epoll_wait(2)/epoll_pwait(2). EpollEvents + + // MmapProt is the protection argument in mmap(2). + MmapProt + + // MmapFlags is the flags argument in mmap(2). + MmapFlags ) // defaultFormat is the syscall argument format to use if the actual format is diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 9cd238efd..90a719ba2 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -1569,9 +1569,9 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { - t.SendSignal(&arch.SignalInfo{ + t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }) return 0, nil, syserror.EFBIG } @@ -1632,9 +1632,9 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { - t.SendSignal(&arch.SignalInfo{ + t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }) return 0, nil, syserror.EFBIG } @@ -1673,9 +1673,11 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { if err != nil { return err } + c := t.Credentials() hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN) isOwner := uattr.Owner.UID == c.EffectiveKUID + var clearPrivilege bool if uid.Ok() { kuid := c.UserNamespace.MapToKUID(uid) // Valid UID must be supplied if UID is to be changed. @@ -1693,6 +1695,11 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { return syserror.EPERM } + // The setuid and setgid bits are cleared during a chown. + if uattr.Owner.UID != kuid { + clearPrivilege = true + } + owner.UID = kuid } if gid.Ok() { @@ -1711,6 +1718,11 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { return syserror.EPERM } + // The setuid and setgid bits are cleared during a chown. + if uattr.Owner.GID != kgid { + clearPrivilege = true + } + owner.GID = kgid } @@ -1721,10 +1733,14 @@ func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error { if err := d.Inode.SetOwner(t, d, owner); err != nil { return err } + // Clear privilege bits if needed and they are set. + if clearPrivilege && uattr.Perms.HasSetUIDOrGID() && !fs.IsDir(d.Inode.StableAttr) { + uattr.Perms.DropSetUIDAndMaybeGID() + if !d.Inode.SetPermissions(t, d, uattr.Perms) { + return syserror.EPERM + } + } - // When the owner or group are changed by an unprivileged user, - // chown(2) also clears the set-user-ID and set-group-ID bits, but - // we do not support them. return nil } @@ -2124,9 +2140,9 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, syserror.EFBIG } if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur { - t.SendSignal(&arch.SignalInfo{ + t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }) return 0, nil, syserror.EFBIG } diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 53b12dc41..27a7f7fe1 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -84,9 +84,9 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if !mayKill(t, target, sig) { return 0, nil, syserror.EPERM } - info := &arch.SignalInfo{ + info := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, } info.SetPID(int32(target.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) @@ -123,9 +123,9 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // depend on the iteration order. We at least implement the // semantics documented by the man page: "On success (at least // one signal was sent), zero is returned." - info := &arch.SignalInfo{ + info := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) @@ -167,9 +167,9 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC continue } - info := &arch.SignalInfo{ + info := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) @@ -184,10 +184,10 @@ func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } } -func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *arch.SignalInfo { - info := &arch.SignalInfo{ +func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *linux.SignalInfo { + info := &linux.SignalInfo{ Signo: int32(sig), - Code: arch.SignalInfoTkill, + Code: linux.SI_TKILL, } info.SetPID(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) @@ -251,20 +251,20 @@ func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - var newactptr *arch.SignalAct + var newactptr *linux.SigAction if newactarg != 0 { - newact, err := t.CopyInSignalAct(newactarg) - if err != nil { + var newact linux.SigAction + if _, err := newact.CopyIn(t, newactarg); err != nil { return 0, nil, err } newactptr = &newact } - oldact, err := t.ThreadGroup().SetSignalAct(sig, newactptr) + oldact, err := t.ThreadGroup().SetSigAction(sig, newactptr) if err != nil { return 0, nil, err } if oldactarg != 0 { - if err := t.CopyOutSignalAct(oldactarg, &oldact); err != nil { + if _, err := oldact.CopyOut(t, oldactarg); err != nil { return 0, nil, err } } @@ -325,13 +325,12 @@ func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S alt := t.SignalStack() if oldaddr != 0 { - if err := t.CopyOutSignalStack(oldaddr, &alt); err != nil { + if _, err := alt.CopyOut(t, oldaddr); err != nil { return 0, nil, err } } if setaddr != 0 { - alt, err := t.CopyInSignalStack(setaddr) - if err != nil { + if _, err := alt.CopyIn(t, setaddr); err != nil { return 0, nil, err } // The signal stack cannot be changed if the task is currently @@ -410,7 +409,7 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // We must ensure that the Signo is set (Linux overrides this in the // same way), and that the code is in the allowed set. This same logic // appears below in RtSigtgqueueinfo and should be kept in sync. - var info arch.SignalInfo + var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } @@ -426,7 +425,7 @@ func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. - if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, syserror.EPERM } @@ -454,7 +453,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker } // Copy in the info. See RtSigqueueinfo above. - var info arch.SignalInfo + var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } @@ -469,7 +468,7 @@ func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *ker // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. - if (info.Code >= 0 || info.Code == arch.SignalInfoTkill) && target != t { + if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, syserror.EPERM } diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 3185ea527..0d5056303 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -398,7 +398,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // out the fields it would set for a successful waitid in this case // as well. if infop != 0 { - var si arch.SignalInfo + var si linux.SignalInfo _, err = si.CopyOut(t, infop) } } @@ -413,7 +413,7 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if infop == 0 { return 0, nil, nil } - si := arch.SignalInfo{ + si := linux.SignalInfo{ Signo: int32(linux.SIGCHLD), } si.SetPID(int32(wr.TID)) @@ -423,24 +423,24 @@ func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal s := unix.WaitStatus(wr.Status) switch { case s.Exited(): - si.Code = arch.CLD_EXITED + si.Code = linux.CLD_EXITED si.SetStatus(int32(s.ExitStatus())) case s.Signaled(): - si.Code = arch.CLD_KILLED + si.Code = linux.CLD_KILLED si.SetStatus(int32(s.Signal())) case s.CoreDump(): - si.Code = arch.CLD_DUMPED + si.Code = linux.CLD_DUMPED si.SetStatus(int32(s.Signal())) case s.Stopped(): if wr.Event == kernel.EventTraceeStop { - si.Code = arch.CLD_TRAPPED + si.Code = linux.CLD_TRAPPED si.SetStatus(int32(s.TrapCause())) } else { - si.Code = arch.CLD_STOPPED + si.Code = linux.CLD_STOPPED si.SetStatus(int32(s.StopSignal())) } case s.Continued(): - si.Code = arch.CLD_CONTINUED + si.Code = linux.CLD_CONTINUED si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index 83b777bbd..5c3b3dee2 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -180,21 +180,21 @@ func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC // // +stateify savable type clockNanosleepRestartBlock struct { - c ktime.Clock - duration time.Duration - rem hostarch.Addr + c ktime.Clock + end ktime.Time + rem hostarch.Addr } // Restart implements kernel.SyscallRestartBlock.Restart. func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { - return 0, clockNanosleepFor(t, n.c, n.duration, n.rem) + return 0, clockNanosleepUntil(t, n.c, n.end, n.rem, true) } // clockNanosleepUntil blocks until a specified time. // // If blocking is interrupted, the syscall is restarted with the original // arguments. -func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error { +func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem hostarch.Addr, needRestartBlock bool) error { notifier, tchan := ktime.NewChannelNotifier() timer := ktime.NewTimer(c, notifier) @@ -202,43 +202,22 @@ func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, ts linux.Timespec) error timer.Swap(ktime.Setting{ Period: 0, Enabled: true, - Next: ktime.FromTimespec(ts), + Next: end, }) err := t.BlockWithTimer(nil, tchan) timer.Destroy() - // Did we just block until the timeout happened? - if err == syserror.ETIMEDOUT { - return nil - } - - return syserror.ConvertIntr(err, syserror.ERESTARTNOHAND) -} - -// clockNanosleepFor blocks for a specified duration. -// -// If blocking is interrupted, the syscall is restarted with the remaining -// duration timeout. -func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem hostarch.Addr) error { - timer, start, tchan := ktime.After(c, dur) - - err := t.BlockWithTimer(nil, tchan) - - after := c.Now() - - timer.Destroy() - switch err { case syserror.ETIMEDOUT: // Slept for entire timeout. return nil case syserror.ErrInterrupted: // Interrupted. - remaining := dur - after.Sub(start) - if remaining < 0 { - remaining = time.Duration(0) + remaining := end.Sub(c.Now()) + if remaining <= 0 { + return nil } // Copy out remaining time. @@ -248,14 +227,16 @@ func clockNanosleepFor(t *kernel.Task, c ktime.Clock, dur time.Duration, rem hos return err } } - - // Arrange for a restart with the remaining duration. - t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ - c: c, - duration: remaining, - rem: rem, - }) - return syserror.ERESTART_RESTARTBLOCK + if needRestartBlock { + // Arrange for a restart with the remaining duration. + t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ + c: c, + end: end, + rem: rem, + }) + return syserror.ERESTART_RESTARTBLOCK + } + return syserror.ERESTARTNOHAND default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } @@ -278,7 +259,8 @@ func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys // Just like linux, we cap the timeout with the max number that int64 can // represent which is roughly 292 years. dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond - return 0, nil, clockNanosleepFor(t, t.Kernel().MonotonicClock(), dur, rem) + c := t.Kernel().MonotonicClock() + return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // ClockNanosleep implements linux syscall clock_nanosleep(2). @@ -312,11 +294,11 @@ func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } if flags&linux.TIMER_ABSTIME != 0 { - return 0, nil, clockNanosleepUntil(t, c, req) + return 0, nil, clockNanosleepUntil(t, c, ktime.FromTimespec(req), 0, false) } dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond - return 0, nil, clockNanosleepFor(t, c, dur, rem) + return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // Gettimeofday implements linux syscall gettimeofday(2). diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index c6330c21a..647e089d0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -242,9 +242,9 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } limit := limits.FromContext(t).Get(limits.FileSize).Cur if uint64(size) >= limit { - t.SendSignal(&arch.SignalInfo{ + t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), - Code: arch.SignalInfoUser, + Code: linux.SI_USER, }) return 0, nil, syserror.EFBIG } diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index bc75a47fc..202486a1e 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", - template = "//pkg/sync:generic_seqatomic", + template = "//pkg/sync/seqatomic:generic_seqatomic", types = { "Value": "Parameters", }, diff --git a/pkg/sentry/time/sampler_amd64.go b/pkg/sentry/time/sampler_amd64.go index 72cb74be3..9f1b4b2fb 100644 --- a/pkg/sentry/time/sampler_amd64.go +++ b/pkg/sentry/time/sampler_amd64.go @@ -16,7 +16,7 @@ package time -const( +const ( // defaultOverheadTSC is the default estimated syscall overhead in TSC cycles. // It is further refined as syscalls are made. defaultOverheadCycles = 1 * 1000 diff --git a/pkg/sentry/time/sampler_arm64.go b/pkg/sentry/time/sampler_arm64.go index b9d0273b7..4c8d33ae4 100644 --- a/pkg/sentry/time/sampler_arm64.go +++ b/pkg/sentry/time/sampler_arm64.go @@ -30,7 +30,7 @@ func getDefaultArchOverheadCycles() TSCValue { // x86 devided by frqRatio cntfrq := getCNTFRQ() frqRatio := 1000000000 / cntfrq - overheadCycles := ( 1 * 1000 ) / frqRatio + overheadCycles := (1 * 1000) / frqRatio return overheadCycles } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 176bcc242..ef8d8a813 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -454,6 +454,9 @@ type FileDescriptionImpl interface { // RemoveXattr removes the given extended attribute from the file. RemoveXattr(ctx context.Context, name string) error + // SupportsLocks indicates whether file locks are supported. + SupportsLocks() bool + // LockBSD tries to acquire a BSD-style advisory file lock. LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error @@ -818,6 +821,11 @@ func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) e return fd.Sync(ctx) } +// SupportsLocks indicates whether file locks are supported. +func (fd *FileDescription) SupportsLocks() bool { + return fd.impl.SupportsLocks() +} + // LockBSD tries to acquire a BSD-style advisory file lock. func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error { atomic.StoreUint32(&fd.usedLockBSD, 1) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index b87d9690a..2b6f47b4b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -413,6 +413,11 @@ type LockFD struct { locks *FileLocks } +// SupportsLocks implements FileDescriptionImpl.SupportsLocks. +func (LockFD) SupportsLocks() bool { + return true +} + // Init initializes fd with FileLocks to use. func (fd *LockFD) Init(locks *FileLocks) { fd.locks = locks @@ -423,28 +428,28 @@ func (fd *LockFD) Locks() *FileLocks { return fd.locks } -// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +// LockBSD implements FileDescriptionImpl.LockBSD. func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return fd.locks.LockBSD(ctx, uid, ownerPID, t, block) } -// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +// UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { fd.locks.UnlockBSD(uid) return nil } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +// LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block) } -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return fd.locks.UnlockPOSIX(ctx, uid, r) } -// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +// TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return fd.locks.TestPOSIX(ctx, uid, t, r) } @@ -455,27 +460,68 @@ func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.L // +stateify savable type NoLockFD struct{} -// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +// SupportsLocks implements FileDescriptionImpl.SupportsLocks. +func (NoLockFD) SupportsLocks() bool { + return false +} + +// LockBSD implements FileDescriptionImpl.LockBSD. func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return syserror.ENOLCK } -// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +// UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return syserror.ENOLCK } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +// LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return syserror.ENOLCK } -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return syserror.ENOLCK } -// TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX. +// TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return linux.Flock{}, syserror.ENOLCK } + +// BadLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning EBADF. +// +// +stateify savable +type BadLockFD struct{} + +// SupportsLocks implements FileDescriptionImpl.SupportsLocks. +func (BadLockFD) SupportsLocks() bool { + return false +} + +// LockBSD implements FileDescriptionImpl.LockBSD. +func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { + return syserror.EBADF +} + +// UnlockBSD implements FileDescriptionImpl.UnlockBSD. +func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.EBADF +} + +// LockPOSIX implements FileDescriptionImpl.LockPOSIX. +func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { + return syserror.EBADF +} + +// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. +func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { + return syserror.EBADF +} + +// TestPOSIX implements FileDescriptionImpl.TestPOSIX. +func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { + return linux.Flock{}, syserror.EBADF +} diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD index d8c4d27b9..ea82f4987 100644 --- a/pkg/sentry/vfs/memxattr/BUILD +++ b/pkg/sentry/vfs/memxattr/BUILD @@ -8,6 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index 638b5d830..9b7953fa3 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -17,7 +17,10 @@ package memxattr import ( + "strings" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -26,6 +29,9 @@ import ( // SimpleExtendedAttributes implements extended attributes using a map of // names to values. // +// SimpleExtendedAttributes calls vfs.CheckXattrPermissions, so callers are not +// required to do so. +// // +stateify savable type SimpleExtendedAttributes struct { // mu protects the below fields. @@ -34,7 +40,11 @@ type SimpleExtendedAttributes struct { } // GetXattr returns the value at 'name'. -func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) { +func (x *SimpleExtendedAttributes) GetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.GetXattrOptions) (string, error) { + if err := vfs.CheckXattrPermissions(creds, vfs.MayRead, mode, kuid, opts.Name); err != nil { + return "", err + } + x.mu.RLock() value, ok := x.xattrs[opts.Name] x.mu.RUnlock() @@ -50,7 +60,11 @@ func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, } // SetXattr sets 'value' at 'name'. -func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error { +func (x *SimpleExtendedAttributes) SetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.SetXattrOptions) error { + if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, opts.Name); err != nil { + return err + } + x.mu.Lock() defer x.mu.Unlock() if x.xattrs == nil { @@ -73,12 +87,19 @@ func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error { } // ListXattr returns all names in xattrs. -func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) { +func (x *SimpleExtendedAttributes) ListXattr(creds *auth.Credentials, size uint64) ([]string, error) { // Keep track of the size of the buffer needed in listxattr(2) for the list. listSize := 0 x.mu.RLock() names := make([]string, 0, len(x.xattrs)) + haveCap := creds.HasCapability(linux.CAP_SYS_ADMIN) for n := range x.xattrs { + // Hide extended attributes in the "trusted" namespace from + // non-privileged users. This is consistent with Linux's + // fs/xattr.c:simple_xattr_list(). + if !haveCap && strings.HasPrefix(n, linux.XATTR_TRUSTED_PREFIX) { + continue + } names = append(names, n) // Add one byte per null terminator. listSize += len(n) + 1 @@ -91,7 +112,11 @@ func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) { } // RemoveXattr removes the xattr at 'name'. -func (x *SimpleExtendedAttributes) RemoveXattr(name string) error { +func (x *SimpleExtendedAttributes) RemoveXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, name string) error { + if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, name); err != nil { + return err + } + x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 82fd382c2..f93da3af1 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -220,7 +220,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr vdDentry := vd.dentry vdDentry.mu.Lock() for { - if vdDentry.dead { + if vd.mount.umounted || vdDentry.dead { vdDentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef(ctx) diff --git a/pkg/sentry/vfs/opath.go b/pkg/sentry/vfs/opath.go index 47848c76b..e9651b631 100644 --- a/pkg/sentry/vfs/opath.go +++ b/pkg/sentry/vfs/opath.go @@ -24,96 +24,96 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// opathFD implements vfs.FileDescriptionImpl for a file description opened with O_PATH. +// opathFD implements FileDescriptionImpl for a file description opened with O_PATH. // // +stateify savable type opathFD struct { vfsfd FileDescription FileDescriptionDefaultImpl - NoLockFD + BadLockFD } -// Release implements vfs.FileDescriptionImpl.Release. +// Release implements FileDescriptionImpl.Release. func (fd *opathFD) Release(context.Context) { // noop } -// Allocate implements vfs.FileDescriptionImpl.Allocate. +// Allocate implements FileDescriptionImpl.Allocate. func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return syserror.EBADF } -// PRead implements vfs.FileDescriptionImpl.PRead. +// PRead implements FileDescriptionImpl.PRead. func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EBADF } -// Read implements vfs.FileDescriptionImpl.Read. +// Read implements FileDescriptionImpl.Read. func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, syserror.EBADF } -// PWrite implements vfs.FileDescriptionImpl.PWrite. +// PWrite implements FileDescriptionImpl.PWrite. func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, syserror.EBADF } -// Write implements vfs.FileDescriptionImpl.Write. +// Write implements FileDescriptionImpl.Write. func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, syserror.EBADF } -// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +// Ioctl implements FileDescriptionImpl.Ioctl. func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, syserror.EBADF } -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +// IterDirents implements FileDescriptionImpl.IterDirents. func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return syserror.EBADF } -// Seek implements vfs.FileDescriptionImpl.Seek. +// Seek implements FileDescriptionImpl.Seek. func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, syserror.EBADF } -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap. func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return syserror.EBADF } -// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +// ListXattr implements FileDescriptionImpl.ListXattr. func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { return nil, syserror.EBADF } -// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +// GetXattr implements FileDescriptionImpl.GetXattr. func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", syserror.EBADF } -// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +// SetXattr implements FileDescriptionImpl.SetXattr. func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { return syserror.EBADF } -// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +// RemoveXattr implements FileDescriptionImpl.RemoveXattr. func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { return syserror.EBADF } -// Sync implements vfs.FileDescriptionImpl.Sync. +// Sync implements FileDescriptionImpl.Sync. func (fd *opathFD) Sync(ctx context.Context) error { return syserror.EBADF } -// SetStat implements vfs.FileDescriptionImpl.SetStat. +// SetStat implements FileDescriptionImpl.SetStat. func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { return syserror.EBADF } -// Stat implements vfs.FileDescriptionImpl.Stat. +// Stat implements FileDescriptionImpl.Stat. func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { vfsObj := fd.vfsfd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 8e3146d8d..8d563d53a 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -115,14 +115,14 @@ func (a *Action) Get() interface{} { } // String returns Action's string representation. -func (a *Action) String() string { - switch *a { +func (a Action) String() string { + switch a { case LogWarning: return "logWarning" case Panic: return "panic" default: - panic(fmt.Sprintf("Invalid watchdog action: %d", *a)) + panic(fmt.Sprintf("Invalid watchdog action: %d", a)) } } @@ -243,6 +243,7 @@ func (w *Watchdog) waitForStart() { } stuckStartup.Increment() + metric.WeirdnessMetric.Increment("watchdog_stuck_startup") var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) @@ -312,10 +313,11 @@ func (w *Watchdog) runTurn() { // New stuck task detected. // // Note that tasks blocked doing IO may be considered stuck in kernel, - // unless they are surrounded b + // unless they are surrounded by // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() + metric.WeirdnessMetric.Increment("watchdog_stuck_tasks") newTaskFound = true } newOffenders[t] = tc |