diff options
Diffstat (limited to 'pkg/sentry/kernel/kernel.go')
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 157 |
1 files changed, 70 insertions, 87 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 4c2d48e65..8c1f79ab5 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -112,11 +112,6 @@ type Kernel struct { rootIPCNamespace *IPCNamespace rootAbstractSocketNamespace *AbstractSocketNamespace - // mounts holds the state of the virtual filesystem. mounts is initially - // nil, and must be set by calling Kernel.SetRootMountNamespace before - // Kernel.CreateProcess can succeed. - mounts *fs.MountNamespace - // futexes is the "root" futex.Manager, from which all others are forked. // This is necessary to ensure that shared futexes are coherent across all // tasks, including those created by CreateProcess. @@ -197,6 +192,15 @@ type Kernel struct { // caches. Not all caches use it, only the caches that use host resources use // the limiter. It may be nil if disabled. DirentCacheLimiter *fs.DirentCacheLimiter + + // unimplementedSyscallEmitterOnce is used in the initialization of + // unimplementedSyscallEmitter. + unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` + + // unimplementedSyscallEmitter is used to emit unimplemented syscall + // events. This is initialized lazily on the first unimplemented + // syscall. + unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` } // InitKernelArgs holds arguments to Init. @@ -290,7 +294,6 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() - return nil } @@ -384,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs() error { - // Flush all mount sources for currently mounted filesystems in the - // root mount namespace. - k.mounts.FlushMountSourceRefs() - - // Some tasks may have other mount namespaces; flush those as well. + // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { @@ -497,7 +496,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { } // LoadFrom returns a new Kernel loaded from args. -func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { +func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() k.networkStack = net @@ -541,6 +540,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { log.Infof("Overall load took [%s]", time.Since(loadStart)) + k.Timekeeper().SetClocks(clocks) + if net != nil { + net.Resume() + } + // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening @@ -550,7 +554,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { tcpip.AsyncLoading.Wait() - log.Infof("Overall load took [%s]", time.Since(loadStart)) + log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so // it can't change across save/restore. When we are virtualizing CPU @@ -565,16 +569,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { return nil } -// Destroy releases resources owned by k. -// -// Preconditions: There must be no task goroutines running in k. -func (k *Kernel) Destroy() { - if k.mounts != nil { - k.mounts.DecRef() - k.mounts = nil - } -} - // UniqueID returns a unique identifier. func (k *Kernel) UniqueID() uint64 { id := atomic.AddUint64(&k.uniqueID, 1) @@ -586,11 +580,17 @@ func (k *Kernel) UniqueID() uint64 { // CreateProcessArgs holds arguments to kernel.CreateProcess. type CreateProcessArgs struct { - // Filename is the filename to load. + // Filename is the filename to load as the init binary. // - // If this is provided as "", then the file will be guessed via Argv[0]. + // If this is provided as "", File will be checked, then the file will be + // guessed via Argv[0]. Filename string + // File is a passed host FD pointing to a file to load as the init binary. + // + // This is checked if and only if Filename is "". + File *fs.File + // Argvv is a list of arguments. Argv []string @@ -632,19 +632,12 @@ type CreateProcessArgs struct { AbstractSocketNamespace *AbstractSocketNamespace // MountNamespace optionally contains the mount namespace for this - // process. If nil, the kernel's mount namespace is used. + // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespace must donate a reference (i.e. // increment it). MountNamespace *fs.MountNamespace - // Root optionally contains the dirent that serves as the root for the - // process. If nil, the mount namespace's root is used as the process' - // root. - // - // Anyone setting Root must donate a reference (i.e. increment it). - Root *fs.Dirent - // ContainerID is the container that the process belongs to. ContainerID string } @@ -682,16 +675,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: - if ctx.args.Root != nil { - // Take a reference on the root dirent that will be - // given to the caller. - ctx.args.Root.IncRef() - return ctx.args.Root - } - if ctx.k.mounts != nil { - // MountNamespace.Root() will take a reference on the - // root dirent for us. - return ctx.k.mounts.Root() + if ctx.args.MountNamespace != nil { + // MountNamespace.Root() will take a reference on the root + // dirent for us. + return ctx.args.MountNamespace.Root() } return nil case fs.CtxDirentCacheLimiter: @@ -735,30 +722,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) - if k.mounts == nil { - return nil, 0, fmt.Errorf("no kernel MountNamespace") - } - // Grab the mount namespace. mounts := args.MountNamespace if mounts == nil { - // If no MountNamespace was configured, then use the kernel's - // root mount namespace, with an extra reference that will be - // donated to the task. - mounts = k.mounts + mounts = k.GlobalInit().Leader().MountNamespace() mounts.IncRef() } tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) - // Grab the root directory. - root := args.Root - if root == nil { - // If no Root was configured, then get it from the - // MountNamespace. - root = mounts.Root() - } + // Get the root directory from the MountNamespace. + root := mounts.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. defer root.DecRef() @@ -768,15 +743,23 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, wd := root // Default. if args.WorkingDirectory != "" { var err error - wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef() } - if args.Filename == "" { - // Was anything provided? + // Check which file to start from. + switch { + case args.Filename != "": + // If a filename is given, take that. + // Set File to nil so we resolve the path in LoadTaskImage. + args.File = nil + case args.File != nil: + // If File is set, take the File provided directly. + default: + // Otherwise look at Argv and see if the first argument is a valid path. if len(args.Argv) == 0 { return nil, 0, fmt.Errorf("no filename or command provided") } @@ -788,7 +771,8 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. remainingTraversals = uint(args.MaxSymlinkTraversals) - tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet) + + tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) if se != nil { return nil, 0, errors.New(se.String()) } @@ -1032,20 +1016,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } -// RootMountNamespace returns the MountNamespace. -func (k *Kernel) RootMountNamespace() *fs.MountNamespace { - k.extMu.Lock() - defer k.extMu.Unlock() - return k.mounts -} - -// SetRootMountNamespace sets the MountNamespace. -func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) { - k.extMu.Lock() - defer k.extMu.Unlock() - k.mounts = mounts -} - // NetworkStack returns the network stack. NetworkStack may return nil if no // network stack is available. func (k *Kernel) NetworkStack() inet.Stack { @@ -1168,16 +1138,6 @@ func (k *Kernel) SupervisorContext() context.Context { } } -// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event -// channel. -func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { - t := TaskFromContext(ctx) - eventchannel.Emit(&uspb.UnimplementedSyscall{ - Tid: int32(t.ThreadID()), - Registers: t.Arch().StateData().Proto(), - }) -} - // SocketEntry represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // @@ -1246,7 +1206,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: - return ctx.k.mounts.Root() + if ctx.k.globalInit != nil { + return ctx.k.globalInit.mounts.Root() + } + return nil case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: @@ -1272,3 +1235,23 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return nil } } + +// Rate limits for the number of unimplemented syscall events. +const ( + unimplementedSyscallsMaxRate = 100 // events per second + unimplementedSyscallBurst = 1000 // events +) + +// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event +// channel. +func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { + k.unimplementedSyscallEmitterOnce.Do(func() { + k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) + }) + + t := TaskFromContext(ctx) + k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ + Tid: int32(t.ThreadID()), + Registers: t.Arch().StateData().Proto(), + }) +} |