diff options
Diffstat (limited to 'pkg/sentry/kernel/syscalls.go')
-rw-r--r-- | pkg/sentry/kernel/syscalls.go | 305 |
1 files changed, 305 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go new file mode 100644 index 000000000..e20fa3eb6 --- /dev/null +++ b/pkg/sentry/kernel/syscalls.go @@ -0,0 +1,305 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/abi" + "gvisor.googlesource.com/gvisor/pkg/bits" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// maxSyscallNum is the highest supported syscall number. +// +// The types below create fast lookup slices for all syscalls. This maximum +// serves as a sanity check that we don't allocate huge slices for a very large +// syscall. +const maxSyscallNum = 2000 + +// SyscallFn is a syscall implementation. +type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) + +// MissingFn is a syscall to be called when an implementation is missing. +type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) + +// Possible flags for SyscallFlagsTable.enable. +const ( + // syscallPresent indicates that this is not a missing syscall. + // + // This flag is used internally in SyscallFlagsTable. + syscallPresent = 1 << iota + + // StraceEnableLog enables syscall log tracing. + StraceEnableLog + + // StraceEnableEvent enables syscall event tracing. + StraceEnableEvent + + // ExternalBeforeEnable enables the external hook before syscall execution. + ExternalBeforeEnable + + // ExternalAfterEnable enables the external hook after syscall execution. + ExternalAfterEnable +) + +// StraceEnableBits combines both strace log and event flags. +const StraceEnableBits = StraceEnableLog | StraceEnableEvent + +// SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall +// basis. +type SyscallFlagsTable struct { + // mu protects writes to the fields below. + // + // Atomic loads are always allowed. Atomic stores are allowed only + // while mu is held. + mu sync.Mutex + + // enable contains the enable bits for each syscall. + // + // missing syscalls have the same value in enable as missingEnable to + // avoid an extra branch in Word. + enable []uint32 + + // missingEnable contains the enable bits for missing syscalls. + missingEnable uint32 +} + +// Init initializes the struct, with all syscalls in table set to enable. +// +// max is the largest syscall number in table. +func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { + e.enable = make([]uint32, max+1) + for num := range table { + e.enable[num] = syscallPresent + } +} + +// Word returns the enable bitfield for sysno. +func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { + if sysno < uintptr(len(e.enable)) { + return atomic.LoadUint32(&e.enable[sysno]) + } + + return atomic.LoadUint32(&e.missingEnable) +} + +// Enable sets enable bit bit for all syscalls based on s. +// +// Syscalls missing from s are disabled. +// +// Syscalls missing from the initial table passed to Init cannot be added as +// individual syscalls. If present in s they will be ignored. +// +// Callers to Word may see either the old or new value while this function +// is executing. +func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + if missingEnable { + missingVal |= bit + } else { + missingVal &^= bit + } + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + if s[uintptr(num)] { + val |= bit + } else { + val &^= bit + } + atomic.StoreUint32(&e.enable[num], val) + } +} + +// EnableAll sets enable bit bit for all syscalls, present and missing. +func (e *SyscallFlagsTable) EnableAll(bit uint32) { + e.mu.Lock() + defer e.mu.Unlock() + + missingVal := atomic.LoadUint32(&e.missingEnable) + missingVal |= bit + atomic.StoreUint32(&e.missingEnable, missingVal) + + for num := range e.enable { + val := atomic.LoadUint32(&e.enable[num]) + if !bits.IsOn32(val, syscallPresent) { + // Missing. + atomic.StoreUint32(&e.enable[num], missingVal) + continue + } + + val |= bit + atomic.StoreUint32(&e.enable[num], val) + } +} + +// Stracer traces syscall execution. +type Stracer interface { + // SyscallEnter is called on syscall entry. + // + // The returned private data is passed to SyscallExit. + // + // TODO: remove kernel imports from the strace package so + // that the type can be used directly. + SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} + + // SyscallExit is called on syscall exit. + SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) +} + +// SyscallTable is a lookup table of system calls. Critically, a SyscallTable +// is *immutable*. In order to make supporting suspend and resume sane, they +// must be uniquely registered and may not change during operation. +type SyscallTable struct { + // OS is the operating system that this syscall table implements. + OS abi.OS `state:"wait"` + + // Arch is the architecture that this syscall table targets. + Arch arch.Arch `state:"wait"` + + // The OS version that this syscall table implements. + Version Version `state:"manual"` + + // AuditNumber is a numeric constant that represents the syscall table. If + // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by + // linux/audit.h. + AuditNumber uint32 `state:"manual"` + + // Table is the collection of functions. + Table map[uintptr]SyscallFn `state:"manual"` + + // lookup is a fixed-size array that holds the syscalls (indexed by + // their numbers). It is used for fast look ups. + lookup []SyscallFn `state:"manual"` + + // Emulate is a collection of instruction addresses to emulate. The + // keys are addresses, and the values are system call numbers. + Emulate map[usermem.Addr]uintptr `state:"manual"` + + // The function to call in case of a missing system call. + Missing MissingFn `state:"manual"` + + // Stracer traces this syscall table. + Stracer Stracer `state:"manual"` + + // External is used to handle an external callback. + External func(*Kernel) `state:"manual"` + + // ExternalFilterBefore is called before External is called before the syscall is executed. + // External is not called if it returns false. + ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + + // ExternalFilterAfter is called before External is called after the syscall is executed. + // External is not called if it returns false. + ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` + + // FeatureEnable stores the strace and one-shot enable bits. + FeatureEnable SyscallFlagsTable `state:"manual"` +} + +// allSyscallTables contains all known tables. +var allSyscallTables []*SyscallTable + +// SyscallTables returns a read-only slice of registered SyscallTables. +func SyscallTables() []*SyscallTable { + return allSyscallTables +} + +// LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. +func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { + for _, s := range allSyscallTables { + if s.OS == os && s.Arch == a { + return s, true + } + } + return nil, false +} + +// RegisterSyscallTable registers a new syscall table for use by a Kernel. +func RegisterSyscallTable(s *SyscallTable) { + if s.Table == nil { + // Ensure non-nil lookup table. + s.Table = make(map[uintptr]SyscallFn) + } + if s.Emulate == nil { + // Ensure non-nil emulate table. + s.Emulate = make(map[usermem.Addr]uintptr) + } + + var max uintptr + for num := range s.Table { + if num > max { + max = num + } + } + + if max > maxSyscallNum { + panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) + } + + s.lookup = make([]SyscallFn, max+1) + + // Initialize the fast-lookup table. + for num, fn := range s.Table { + s.lookup[num] = fn + } + + s.FeatureEnable.init(s.Table, max) + + if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { + panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) + } + + // Save a reference to this table. + // + // This is required for a Kernel to find the table and for save/restore + // operations below. + allSyscallTables = append(allSyscallTables, s) +} + +// Lookup returns the syscall implementation, if one exists. +func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { + if sysno < uintptr(len(s.lookup)) { + return s.lookup[sysno] + } + + return nil +} + +// LookupEmulate looks up an emulation syscall number. +func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { + sysno, ok := s.Emulate[addr] + return sysno, ok +} + +// mapLookup is similar to Lookup, except that it only uses the syscall table, +// that is, it skips the fast look array. This is available for benchmarking. +func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { + return s.Table[sysno] +} |