// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "bytes" "encoding/json" "fmt" "sort" "strings" "text/tabwriter" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsbridge" hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/urpc" ) // Proc includes task-related functions. // // At the moment, this is limited to exec support. type Proc struct { Kernel *kernel.Kernel } // ExecArgs is the set of arguments to exec. type ExecArgs struct { // Filename is the filename to load. // // If this is provided as "", then the file will be guessed via Argv[0]. Filename string `json:"filename"` // Argv is a list of arguments. Argv []string `json:"argv"` // Envv is a list of environment variables. Envv []string `json:"envv"` // MountNamespace is the mount namespace to execute the new process in. // A reference on MountNamespace must be held for the lifetime of the // ExecArgs. If MountNamespace is nil, it will default to the init // process's MountNamespace. MountNamespace *fs.MountNamespace // MountNamespaceVFS2 is the mount namespace to execute the new process in. // A reference on MountNamespace must be held for the lifetime of the // ExecArgs. If MountNamespace is nil, it will default to the init // process's MountNamespace. MountNamespaceVFS2 *vfs.MountNamespace // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` // KUID is the UID to run with in the root user namespace. Defaults to // root if not set explicitly. KUID auth.KUID // KGID is the GID to run with in the root user namespace. Defaults to // the root group if not set explicitly. KGID auth.KGID // ExtraKGIDs is the list of additional groups to which the user belongs. ExtraKGIDs []auth.KGID // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. urpc.FilePayload // ContainerID is the container for the process being executed. ContainerID string // PIDNamespace is the pid namespace for the process being executed. PIDNamespace *kernel.PIDNamespace } // String prints the arguments as a string. func (args ExecArgs) String() string { a := make([]string, len(args.Argv)) copy(a, args.Argv) if args.Filename != "" { a[0] = args.Filename } return strings.Join(a, " ") } // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { newTG, _, _, _, err := proc.execAsync(args) if err != nil { return err } // Wait for completion. newTG.WaitExited() *waitStatus = newTG.ExitStatus().Status() return nil } // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the // newly created thread group and its PID. If the stdio FDs are TTYs, then a // TTYFileOperations that wraps the TTY is also returned. func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() defer fdTable.DecRef() creds := auth.NewUserCredentials( args.KUID, args.KGID, args.ExtraKGIDs, args.Capabilities, proc.Kernel.RootUserNamespace()) initArgs := kernel.CreateProcessArgs{ Filename: args.Filename, Argv: args.Argv, Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, MountNamespaceVFS2: args.MountNamespaceVFS2, Credentials: creds, FDTable: fdTable, Umask: 0022, Limits: limits.NewLimitSet(), MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: proc.Kernel.RootUTSNamespace(), IPCNamespace: proc.Kernel.RootIPCNamespace(), AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), ContainerID: args.ContainerID, PIDNamespace: args.PIDNamespace, } if initArgs.MountNamespace != nil { // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } if initArgs.MountNamespaceVFS2 != nil { // initArgs must hold a reference on MountNamespaceVFS2, which will // be donated to the new process in CreateProcess. initArgs.MountNamespaceVFS2.IncRef() } ctx := initArgs.NewContext(proc.Kernel) if initArgs.Filename == "" { if kernel.VFS2Enabled { // Get the full path to the filename from the PATH env variable. if initArgs.MountNamespaceVFS2 == nil { // Set initArgs so that 'ctx' returns the namespace. // // MountNamespaceVFS2 adds a reference to the namespace, which is // transferred to the new process. initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() } file, err := getExecutableFD(ctx, creds, proc.Kernel.VFS(), initArgs.MountNamespaceVFS2, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) if err != nil { return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in environment %v: %v", initArgs.Argv[0], initArgs.Envv, err) } initArgs.File = fsbridge.NewVFSFile(file) } else { if initArgs.MountNamespace == nil { // Set initArgs so that 'ctx' returns the namespace. initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } f, err := user.ResolveExecutablePath(ctx, creds, initArgs.MountNamespace, initArgs.Envv, initArgs.WorkingDirectory, initArgs.Argv[0]) if err != nil { return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], initArgs.Envv, err) } initArgs.Filename = f } } fds := make([]int, len(args.FilePayload.Files)) for i, file := range args.FilePayload.Files { if kernel.VFS2Enabled { // Need to dup to remove ownership from os.File. dup, err := unix.Dup(int(file.Fd())) if err != nil { return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) } fds[i] = dup } else { // VFS1 dups the file on import. fds[i] = int(file.Fd()) } } ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) if err != nil { if kernel.VFS2Enabled { for _, fd := range fds { unix.Close(fd) } } return nil, 0, nil, nil, err } tg, tid, err := proc.Kernel.CreateProcess(initArgs) if err != nil { return nil, 0, nil, nil, err } // Set the foreground process group on the TTY before starting the process. switch { case ttyFile != nil: ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) case ttyFileVFS2 != nil: ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) } // Start the newly created process. proc.Kernel.StartProcess(tg) return tg, tid, ttyFile, ttyFileVFS2, nil } // PsArgs is the set of arguments to ps. type PsArgs struct { // JSON will force calls to Ps to return the result as a JSON payload. JSON bool } // Ps provides a process listing for the running kernel. func (proc *Proc) Ps(args *PsArgs, out *string) error { var p []*Process if e := Processes(proc.Kernel, "", &p); e != nil { return e } if !args.JSON { *out = ProcessListToTable(p) } else { s, e := ProcessListToJSON(p) if e != nil { return e } *out = s } return nil } // Process contains information about a single process in a Sandbox. type Process struct { UID auth.KUID `json:"uid"` PID kernel.ThreadID `json:"pid"` // Parent PID PPID kernel.ThreadID `json:"ppid"` Threads []kernel.ThreadID `json:"threads"` // Processor utilization C int32 `json:"c"` // TTY name of the process. Will be of the form "pts/N" if there is a // TTY, or "?" if there is not. TTY string `json:"tty"` // Start time STime string `json:"stime"` // CPU time Time string `json:"time"` // Executable shortname (e.g. "sh" for /bin/sh) Cmd string `json:"cmd"` } // ProcessListToTable prints a table with the following format: // UID PID PPID C TTY STIME TIME CMD // 0 1 0 0 pty/4 14:04 505262ns tail func ProcessListToTable(pl []*Process) string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") for _, d := range pl { fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", d.UID, d.PID, d.PPID, d.C, d.TTY, d.STime, d.Time, d.Cmd) } tw.Flush() return buf.String() } // ProcessListToJSON will return the JSON representation of ps. func ProcessListToJSON(pl []*Process) (string, error) { b, err := json.MarshalIndent(pl, "", " ") if err != nil { return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) } return string(b), nil } // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This // behavior is the same as runc's. func PrintPIDsJSON(pl []*Process) (string, error) { pids := make([]kernel.ThreadID, 0, len(pl)) for _, d := range pl { pids = append(pids, d.PID) } b, err := json.Marshal(pids) if err != nil { return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) } return string(b), nil } // Processes retrieves information about processes running in the sandbox with // the given container id. All processes are returned if 'containerID' is empty. func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { ts := k.TaskSet() now := k.RealtimeClock().Now() for _, tg := range ts.Root.ThreadGroups() { pidns := tg.PIDNamespace() pid := pidns.IDOfThreadGroup(tg) // If tg has already been reaped ignore it. if pid == 0 { continue } if containerID != "" && containerID != tg.Leader().ContainerID() { continue } ppid := kernel.ThreadID(0) if p := tg.Leader().Parent(); p != nil { ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) } threads := tg.MemberIDs(pidns) *out = append(*out, &Process{ UID: tg.Leader().Credentials().EffectiveKUID, PID: pid, PPID: ppid, Threads: threads, STime: formatStartTime(now, tg.Leader().StartTime()), C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), Time: tg.CPUStats().SysTime.String(), Cmd: tg.Leader().Name(), TTY: ttyName(tg.TTY()), }) } sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) return nil } // formatStartTime formats startTime depending on the current time: // - If startTime was today, HH:MM is used. // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) // - If startTime was not this year, the year is used. func formatStartTime(now, startTime ktime.Time) string { nowS, nowNs := now.Unix() n := time.Unix(nowS, nowNs) startTimeS, startTimeNs := startTime.Unix() st := time.Unix(startTimeS, startTimeNs) format := "15:04" if st.YearDay() != n.YearDay() { format = "Jan02" } if st.Year() != n.Year() { format = "2006" } return st.Format(format) } func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { // Note: In procps, there is an option to include child CPU stats. As // it is disabled by default, we do not include them. total := stats.UserTime + stats.SysTime lifetime := now.Sub(startTime) if lifetime <= 0 { return 0 } percentCPU := total * 100 / lifetime // Cap at 99% since procps does the same. if percentCPU > 99 { percentCPU = 99 } return int32(percentCPU) } func ttyName(tty *kernel.TTY) string { if tty == nil { return "?" } return fmt.Sprintf("pts/%d", tty.Index) } // getExecutableFD resolves the given executable name and returns a // vfs.FileDescription for the executable file. func getExecutableFD(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, mns *vfs.MountNamespace, envv []string, wd, name string) (*vfs.FileDescription, error) { path, err := user.ResolveExecutablePathVFS2(ctx, creds, mns, envv, wd, name) if err != nil { return nil, err } root := vfs.RootFromContext(ctx) defer root.DecRef() pop := vfs.PathOperation{ Root: root, Start: root, // binPath is absolute, Start can be anything. Path: fspath.Parse(path), FollowFinalSymlink: true, } opts := &vfs.OpenOptions{ Flags: linux.O_RDONLY, FileExec: true, } f, err := vfsObj.OpenAt(ctx, creds, &pop, opts) if err == syserror.ENOENT || err == syserror.EACCES { return nil, nil } return f, err }