summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/fd_table.go55
-rw-r--r--pkg/sentry/kernel/shm/shm.go2
-rw-r--r--pkg/sentry/kernel/syscalls.go36
-rw-r--r--pkg/sentry/kernel/syscalls_state.go36
-rw-r--r--pkg/sentry/kernel/task.go9
-rw-r--r--pkg/sentry/kernel/task_context.go3
-rw-r--r--pkg/sentry/kernel/task_identity.go2
-rw-r--r--pkg/sentry/kernel/task_signals.go2
-rw-r--r--pkg/sentry/kernel/time/time.go10
9 files changed, 115 insertions, 40 deletions
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index d09d97825..ed40b5303 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -307,6 +307,61 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
return fds, nil
}
+// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
+// greater than or equal to the fd parameter. All files will share the set
+// flags. Success is guaranteed to be all or none.
+func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
+ if fd < 0 {
+ // Don't accept negative FDs.
+ return nil, syscall.EINVAL
+ }
+
+ // Default limit.
+ end := int32(math.MaxInt32)
+
+ // Ensure we don't get past the provided limit.
+ if limitSet := limits.FromContext(ctx); limitSet != nil {
+ lim := limitSet.Get(limits.NumberOfFiles)
+ if lim.Cur != limits.Infinity {
+ end = int32(lim.Cur)
+ }
+ if fd >= end {
+ return nil, syscall.EMFILE
+ }
+ }
+
+ f.mu.Lock()
+ defer f.mu.Unlock()
+
+ // From f.next to find available fd.
+ if fd < f.next {
+ fd = f.next
+ }
+
+ // Install all entries.
+ for i := fd; i < end && len(fds) < len(files); i++ {
+ if d, _, _ := f.getVFS2(i); d == nil {
+ f.setVFS2(i, files[len(fds)], flags) // Set the descriptor.
+ fds = append(fds, i) // Record the file descriptor.
+ }
+ }
+
+ // Failure? Unwind existing FDs.
+ if len(fds) < len(files) {
+ for _, i := range fds {
+ f.setVFS2(i, nil, FDFlags{}) // Zap entry.
+ }
+ return nil, syscall.EMFILE
+ }
+
+ if fd == f.next {
+ // Update next search start position.
+ f.next = fds[len(fds)-1] + 1
+ }
+
+ return fds, nil
+}
+
// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 208569057..f66cfcc7f 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -461,7 +461,7 @@ func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.A
func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ usermem.AddrRange, _ uint64, _ bool) {
s.mu.Lock()
defer s.mu.Unlock()
- // TODO(b/38173783): RemoveMapping may be called during task exit, when ctx
+ // RemoveMapping may be called during task exit, when ctx
// is context.Background. Gracefully handle missing clocks. Failing to
// update the detach time in these cases is ok, since no one can observe the
// omission.
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 93c4fe969..2e3565747 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -209,65 +209,61 @@ type Stracer interface {
// SyscallEnter is called on syscall entry.
//
// The returned private data is passed to SyscallExit.
- //
- // TODO(gvisor.dev/issue/155): remove kernel imports from the strace
- // package so that the type can be used directly.
SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{}
// SyscallExit is called on syscall exit.
SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error)
}
-// SyscallTable is a lookup table of system calls. Critically, a SyscallTable
-// is *immutable*. In order to make supporting suspend and resume sane, they
-// must be uniquely registered and may not change during operation.
+// SyscallTable is a lookup table of system calls.
//
-// +stateify savable
+// Note that a SyscallTable is not savable directly. Instead, they are saved as
+// an OS/Arch pair and lookup happens again on restore.
type SyscallTable struct {
// OS is the operating system that this syscall table implements.
- OS abi.OS `state:"wait"`
+ OS abi.OS
// Arch is the architecture that this syscall table targets.
- Arch arch.Arch `state:"wait"`
+ Arch arch.Arch
// The OS version that this syscall table implements.
- Version Version `state:"manual"`
+ Version Version
// AuditNumber is a numeric constant that represents the syscall table. If
// non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by
// linux/audit.h.
- AuditNumber uint32 `state:"manual"`
+ AuditNumber uint32
// Table is the collection of functions.
- Table map[uintptr]Syscall `state:"manual"`
+ Table map[uintptr]Syscall
// lookup is a fixed-size array that holds the syscalls (indexed by
// their numbers). It is used for fast look ups.
- lookup []SyscallFn `state:"manual"`
+ lookup []SyscallFn
// Emulate is a collection of instruction addresses to emulate. The
// keys are addresses, and the values are system call numbers.
- Emulate map[usermem.Addr]uintptr `state:"manual"`
+ Emulate map[usermem.Addr]uintptr
// The function to call in case of a missing system call.
- Missing MissingFn `state:"manual"`
+ Missing MissingFn
// Stracer traces this syscall table.
- Stracer Stracer `state:"manual"`
+ Stracer Stracer
// External is used to handle an external callback.
- External func(*Kernel) `state:"manual"`
+ External func(*Kernel)
// ExternalFilterBefore is called before External is called before the syscall is executed.
// External is not called if it returns false.
- ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool
// ExternalFilterAfter is called before External is called after the syscall is executed.
// External is not called if it returns false.
- ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"`
+ ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool
// FeatureEnable stores the strace and one-shot enable bits.
- FeatureEnable SyscallFlagsTable `state:"manual"`
+ FeatureEnable SyscallFlagsTable
}
// allSyscallTables contains all known tables.
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 00358326b..90f890495 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -14,16 +14,34 @@
package kernel
-import "fmt"
+import (
+ "fmt"
-// afterLoad is invoked by stateify.
-func (s *SyscallTable) afterLoad() {
- otherTable, ok := LookupSyscallTable(s.OS, s.Arch)
- if !ok {
- // Couldn't find a reference?
- panic(fmt.Sprintf("syscall table not found for OS %v Arch %v", s.OS, s.Arch))
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// syscallTableInfo is used to reload the SyscallTable.
+//
+// +stateify savable
+type syscallTableInfo struct {
+ OS abi.OS
+ Arch arch.Arch
+}
+
+// saveSt saves the SyscallTable.
+func (tc *TaskContext) saveSt() syscallTableInfo {
+ return syscallTableInfo{
+ OS: tc.st.OS,
+ Arch: tc.st.Arch,
}
+}
- // Copy the table.
- *s = *otherTable
+// loadSt loads the SyscallTable.
+func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+ st, ok := LookupSyscallTable(sti.OS, sti.Arch)
+ if !ok {
+ panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
+ }
+ tc.st = st // Save the table reference.
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index d6546735e..e5d133d6c 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -777,6 +777,15 @@ func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error
return t.fdTable.NewFDs(t, fd, files, flags)
}
+// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
+//
+// This automatically passes the task as the context.
+//
+// Precondition: same as FDTable.
+func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
+ return t.fdTable.NewFDsVFS2(t, fd, files, flags)
+}
+
// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
//
// This automatically passes the task as the context.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 0158b1788..9fa528384 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -49,7 +49,7 @@ type TaskContext struct {
fu *futex.Manager
// st is the task's syscall table.
- st *SyscallTable
+ st *SyscallTable `state:".(syscallTableInfo)"`
}
// release releases all resources held by the TaskContext. release is called by
@@ -58,7 +58,6 @@ func (tc *TaskContext) release() {
// Nil out pointers so that if the task is saved after release, it doesn't
// follow the pointers to possibly now-invalid objects.
if tc.MemoryManager != nil {
- // TODO(b/38173783)
tc.MemoryManager.DecUsers(context.Background())
tc.MemoryManager = nil
}
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index ce3e6ef28..0325967e4 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -455,7 +455,7 @@ func (t *Task) SetKeepCaps(k bool) {
t.creds.Store(creds)
}
-// updateCredsForExec updates t.creds to reflect an execve().
+// updateCredsForExecLocked updates t.creds to reflect an execve().
//
// NOTE(b/30815691): We currently do not implement privileged executables
// (set-user/group-ID bits and file capabilities). This allows us to make a lot
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index 8802db142..6aa798346 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -513,8 +513,6 @@ func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
if t.stop != nil {
return false
}
- // - TODO(b/38173783): No special case for when t is also the sending task,
- // because the identity of the sender is unknown.
// - Do not choose tasks that have already been interrupted, as they may be
// busy handling another signal.
if len(t.interruptChan) != 0 {
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 706de83ef..e959700f2 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -245,7 +245,7 @@ type Clock interface {
type WallRateClock struct{}
// WallTimeUntil implements Clock.WallTimeUntil.
-func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
+func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration {
return t.Sub(now)
}
@@ -254,16 +254,16 @@ func (WallRateClock) WallTimeUntil(t, now Time) time.Duration {
type NoClockEvents struct{}
// Readiness implements waiter.Waitable.Readiness.
-func (NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}
// EventRegister implements waiter.Waitable.EventRegister.
-func (NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
}
// EventUnregister implements waiter.Waitable.EventUnregister.
-func (NoClockEvents) EventUnregister(e *waiter.Entry) {
+func (*NoClockEvents) EventUnregister(e *waiter.Entry) {
}
// ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and
@@ -273,7 +273,7 @@ type ClockEventsQueue struct {
}
// Readiness implements waiter.Waitable.Readiness.
-func (ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
+func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask {
return 0
}