// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package shim implements Containerd Shim v2 interface. package shim import ( "context" "fmt" "io" "os" "os/exec" "path/filepath" "strings" "sync" "time" "github.com/BurntSushi/toml" "github.com/containerd/cgroups" cgroupsstats "github.com/containerd/cgroups/stats/v1" "github.com/containerd/console" "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/namespaces" "github.com/containerd/containerd/pkg/process" "github.com/containerd/containerd/pkg/stdio" "github.com/containerd/containerd/runtime" "github.com/containerd/containerd/runtime/linux/runctypes" "github.com/containerd/containerd/runtime/v2/shim" taskAPI "github.com/containerd/containerd/runtime/v2/task" "github.com/containerd/containerd/sys/reaper" "github.com/containerd/typeurl" "github.com/gogo/protobuf/types" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/shim/proc" "gvisor.dev/gvisor/pkg/shim/runsc" "gvisor.dev/gvisor/pkg/shim/runtimeoptions" "gvisor.dev/gvisor/pkg/shim/utils" "gvisor.dev/gvisor/runsc/specutils" ) var ( empty = &types.Empty{} bufPool = sync.Pool{ New: func() interface{} { buffer := make([]byte, 32<<10) return &buffer }, } ) var _ = (taskAPI.TaskService)(&service{}) const ( // configFile is the default config file name. For containerd 1.2, // we assume that a config.toml should exist in the runtime root. configFile = "config.toml" // shimAddressPath is the relative path to a file that contains the address // to the shim UDS. See service.shimAddress. shimAddressPath = "address" cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent" ) // New returns a new shim service that can be used via GRPC. func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) { var opts shim.Opts if ctxOpts := ctx.Value(shim.OptsKey{}); ctxOpts != nil { opts = ctxOpts.(shim.Opts) } ep, err := newOOMEpoller(publisher) if err != nil { return nil, err } go ep.run(ctx) s := &service{ id: id, processes: make(map[string]process.Process), events: make(chan interface{}, 128), ec: proc.ExitCh, oomPoller: ep, cancel: cancel, genericOptions: opts, } go s.processExits(ctx) runsc.Monitor = &runsc.LogMonitor{Next: reaper.Default} if err := s.initPlatform(); err != nil { cancel() return nil, fmt.Errorf("failed to initialized platform behavior: %w", err) } go s.forward(ctx, publisher) if address, err := shim.ReadAddress(shimAddressPath); err == nil { s.shimAddress = address } return s, nil } // service is the shim implementation of a remote shim over GRPC. It runs in 2 // different modes: // 1. Service: process runs for the life time of the container and receives // calls described in shimapi.TaskService interface. // 2. Tool: process is short lived and runs only to perform the requested // operations and then exits. It implements the direct functions in // shim.Shim interface. // // When the service is running, it saves a json file with state information so // that commands sent to the tool can load the state and perform the operation. type service struct { mu sync.Mutex // id is the container ID. id string // bundle is a path provided by the caller on container creation. Store // because it's needed in commands that don't receive bundle in the request. bundle string // task is the main process that is running the container. task *proc.Init // processes maps ExecId to processes running through exec. processes map[string]process.Process events chan interface{} // platform handles operations related to the console. platform stdio.Platform // genericOptions are options that come from the shim interface and are common // to all shims. genericOptions shim.Opts // opts are configuration options specific for this shim. opts options // ex gets notified whenever the container init process or an exec'd process // exits from inside the sandbox. ec chan proc.Exit // oomPoller monitors the sandbox's cgroup for OOM notifications. oomPoller *epoller // cancel is a function that needs to be called before the shim stops. The // function is provided by the caller to New(). cancel func() // shimAddress is the location of the UDS used to communicate to containerd. shimAddress string } func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) { ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } self, err := os.Executable() if err != nil { return nil, err } cwd, err := os.Getwd() if err != nil { return nil, err } args := []string{ "-namespace", ns, "-address", containerdAddress, "-publish-binary", containerdBinary, } if s.genericOptions.Debug { args = append(args, "-debug") } cmd := exec.Command(self, args...) cmd.Dir = cwd cmd.Env = append(os.Environ(), "GOMAXPROCS=2") cmd.SysProcAttr = &unix.SysProcAttr{ Setpgid: true, } return cmd, nil } func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (string, error) { log.L.Debugf("StartShim, id: %s, binary: %q, address: %q", id, containerdBinary, containerdAddress) cmd, err := s.newCommand(ctx, containerdBinary, containerdAddress) if err != nil { return "", err } address, err := shim.SocketAddress(ctx, containerdAddress, id) if err != nil { return "", err } socket, err := shim.NewSocket(address) if err != nil { // The only time where this would happen is if there is a bug and the socket // was not cleaned up in the cleanup method of the shim or we are using the // grouping functionality where the new process should be run with the same // shim as an existing container. if !shim.SocketEaddrinuse(err) { return "", fmt.Errorf("create new shim socket: %w", err) } if shim.CanConnect(address) { if err := shim.WriteAddress(shimAddressPath, address); err != nil { return "", fmt.Errorf("write existing socket for shim: %w", err) } return address, nil } if err := shim.RemoveSocket(address); err != nil { return "", fmt.Errorf("remove pre-existing socket: %w", err) } if socket, err = shim.NewSocket(address); err != nil { return "", fmt.Errorf("try create new shim socket 2x: %w", err) } } cu := cleanup.Make(func() { socket.Close() _ = shim.RemoveSocket(address) }) defer cu.Clean() f, err := socket.File() if err != nil { return "", err } cmd.ExtraFiles = append(cmd.ExtraFiles, f) log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args) if err := cmd.Start(); err != nil { f.Close() return "", err } cu.Add(func() { cmd.Process.Kill() }) // make sure to wait after start go cmd.Wait() if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil { return "", err } if err := shim.WriteAddress(shimAddressPath, address); err != nil { return "", err } if err := shim.SetScore(cmd.Process.Pid); err != nil { return "", fmt.Errorf("failed to set OOM Score on shim: %w", err) } cu.Release() return address, nil } // Cleanup is called from another process (need to reload state) to stop the // container and undo all operations done in Create(). func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) { log.L.Debugf("Cleanup") path, err := os.Getwd() if err != nil { return nil, err } ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } var st state if err := st.load(path); err != nil { return nil, err } r := proc.NewRunsc(s.opts.Root, path, ns, st.Options.BinaryName, nil) if err := r.Delete(ctx, s.id, &runsc.DeleteOpts{ Force: true, }); err != nil { log.L.Infof("failed to remove runc container: %v", err) } if err := mount.UnmountAll(st.Rootfs, 0); err != nil { log.L.Infof("failed to cleanup rootfs mount: %v", err) } return &taskAPI.DeleteResponse{ ExitedAt: time.Now(), ExitStatus: 128 + uint32(unix.SIGKILL), }, nil } // Create creates a new initial process and container with the underlying OCI // runtime. func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) { s.mu.Lock() defer s.mu.Unlock() // Save the main task id and bundle to the shim for additional requests. s.id = r.ID s.bundle = r.Bundle ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, fmt.Errorf("create namespace: %w", err) } // Read from root for now. if r.Options != nil { v, err := typeurl.UnmarshalAny(r.Options) if err != nil { return nil, err } var path string switch o := v.(type) { case *runctypes.CreateOptions: // containerd 1.2.x s.opts.IoUID = o.IoUid s.opts.IoGID = o.IoGid s.opts.ShimCgroup = o.ShimCgroup case *runctypes.RuncOptions: // containerd 1.2.x root := proc.RunscRoot if o.RuntimeRoot != "" { root = o.RuntimeRoot } s.opts.BinaryName = o.Runtime path = filepath.Join(root, configFile) if _, err := os.Stat(path); err != nil { if !os.IsNotExist(err) { return nil, fmt.Errorf("stat config file %q: %w", path, err) } // A config file in runtime root is not required. path = "" } case *runtimeoptions.Options: // containerd 1.3.x+ if o.ConfigPath == "" { break } if o.TypeUrl != optionsType { return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl) } path = o.ConfigPath default: return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl) } if path != "" { if _, err = toml.DecodeFile(path, &s.opts); err != nil { return nil, fmt.Errorf("decode config file %q: %w", path, err) } } } if len(s.opts.LogLevel) != 0 { lvl, err := logrus.ParseLevel(s.opts.LogLevel) if err != nil { return nil, err } logrus.SetLevel(lvl) } if len(s.opts.LogPath) != 0 { logPath := runsc.FormatShimLogPath(s.opts.LogPath, s.id) if err := os.MkdirAll(filepath.Dir(logPath), 0777); err != nil { return nil, fmt.Errorf("failed to create log dir: %w", err) } logFile, err := os.Create(logPath) if err != nil { return nil, fmt.Errorf("failed to create log file: %w", err) } log.L.Debugf("Starting mirror log at %q", logPath) std := logrus.StandardLogger() std.SetOutput(io.MultiWriter(std.Out, logFile)) log.L.Debugf("Create shim") log.L.Debugf("***************************") log.L.Debugf("Args: %s", os.Args) log.L.Debugf("PID: %d", os.Getpid()) log.L.Debugf("ID: %s", s.id) log.L.Debugf("Options: %+v", s.opts) log.L.Debugf("Bundle: %s", r.Bundle) log.L.Debugf("Terminal: %t", r.Terminal) log.L.Debugf("stdin: %s", r.Stdin) log.L.Debugf("stdout: %s", r.Stdout) log.L.Debugf("stderr: %s", r.Stderr) log.L.Debugf("***************************") if log.L.Logger.IsLevelEnabled(logrus.DebugLevel) { setDebugSigHandler() } } // Save state before any action is taken to ensure Cleanup() will have all // the information it needs to undo the operations. st := state{ Rootfs: filepath.Join(r.Bundle, "rootfs"), Options: s.opts, } if err := st.save(r.Bundle); err != nil { return nil, err } if err := os.Mkdir(st.Rootfs, 0711); err != nil && !os.IsExist(err) { return nil, err } // Convert from types.Mount to proc.Mount. var mounts []proc.Mount for _, m := range r.Rootfs { mounts = append(mounts, proc.Mount{ Type: m.Type, Source: m.Source, Target: m.Target, Options: m.Options, }) } // Cleans up all mounts in case of failure. cu := cleanup.Make(func() { if err := mount.UnmountAll(st.Rootfs, 0); err != nil { log.L.Infof("failed to cleanup rootfs mount: %v", err) } }) defer cu.Clean() for _, rm := range mounts { m := &mount.Mount{ Type: rm.Type, Source: rm.Source, Options: rm.Options, } if err := m.Mount(st.Rootfs); err != nil { return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err) } } config := &proc.CreateConfig{ ID: r.ID, Bundle: r.Bundle, Runtime: s.opts.BinaryName, Rootfs: mounts, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, } process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs) if err != nil { return nil, utils.ErrToGRPC(err) } if err := process.Create(ctx, config); err != nil { return nil, utils.ErrToGRPC(err) } // Set up OOM notification on the sandbox's cgroup. This is done on // sandbox create since the sandbox process will be created here. pid := process.Pid() if pid > 0 { cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(pid)) if err != nil { return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err) } if err := s.oomPoller.add(s.id, cg); err != nil { return nil, fmt.Errorf("add cg to OOM monitor: %w", err) } } // Success cu.Release() s.task = process return &taskAPI.CreateTaskResponse{ Pid: uint32(process.Pid()), }, nil } // Start starts a process. func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) { log.L.Debugf("Start, id: %s, execID: %s", r.ID, r.ExecID) p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Start(ctx); err != nil { return nil, err } // TODO: Set the cgroup and oom notifications on restore. // https://github.com/google/gvisor-containerd-shim/issues/58 return &taskAPI.StartResponse{ Pid: uint32(p.Pid()), }, nil } // Delete deletes the initial process and container. func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) { log.L.Debugf("Delete, id: %s, execID: %s", r.ID, r.ExecID) p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Delete(ctx); err != nil { return nil, err } if len(r.ExecID) != 0 { s.mu.Lock() delete(s.processes, r.ExecID) s.mu.Unlock() } else if s.platform != nil { s.platform.Close() } return &taskAPI.DeleteResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), Pid: uint32(p.Pid()), }, nil } // Exec spawns an additional process inside the container. func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) { log.L.Debugf("Exec, id: %s, execID: %s", r.ID, r.ExecID) s.mu.Lock() p := s.processes[r.ExecID] s.mu.Unlock() if p != nil { return nil, utils.ErrToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) } if s.task == nil { return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{ ID: r.ExecID, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Spec: r.Spec, }) if err != nil { return nil, utils.ErrToGRPC(err) } s.mu.Lock() s.processes[r.ExecID] = process s.mu.Unlock() return empty, nil } // ResizePty resizes the terminal of a process. func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) { log.L.Debugf("ResizePty, id: %s, execID: %s, dimension: %dx%d", r.ID, r.ExecID, r.Height, r.Width) p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } ws := console.WinSize{ Width: uint16(r.Width), Height: uint16(r.Height), } if err := p.Resize(ws); err != nil { return nil, utils.ErrToGRPC(err) } return empty, nil } // State returns runtime state information for a process. func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) { log.L.Debugf("State, id: %s, execID: %s", r.ID, r.ExecID) p, err := s.getProcess(r.ExecID) if err != nil { log.L.Debugf("State failed to find process: %v", err) return nil, err } st, err := p.Status(ctx) if err != nil { log.L.Debugf("State failed: %v", err) return nil, err } status := task.StatusUnknown switch st { case "created": status = task.StatusCreated case "running": status = task.StatusRunning case "stopped": status = task.StatusStopped } sio := p.Stdio() res := &taskAPI.StateResponse{ ID: p.ID(), Bundle: s.bundle, Pid: uint32(p.Pid()), Status: status, Stdin: sio.Stdin, Stdout: sio.Stdout, Stderr: sio.Stderr, Terminal: sio.Terminal, ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), } log.L.Debugf("State succeeded, response: %+v", res) return res, nil } // Pause the container. func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) { log.L.Debugf("Pause, id: %s", r.ID) if s.task == nil { log.L.Debugf("Pause error, id: %s: container not created", r.ID) return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Pause(ctx, r.ID) if err != nil { return nil, err } return empty, nil } // Resume the container. func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) { log.L.Debugf("Resume, id: %s", r.ID) if s.task == nil { log.L.Debugf("Resume error, id: %s: container not created", r.ID) return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } err := s.task.Runtime().Resume(ctx, r.ID) if err != nil { return nil, err } return empty, nil } // Kill a process with the provided signal. func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) { log.L.Debugf("Kill, id: %s, execID: %s, signal: %d, all: %t", r.ID, r.ExecID, r.Signal, r.All) p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Kill(ctx, r.Signal, r.All); err != nil { log.L.Debugf("Kill failed: %v", err) return nil, utils.ErrToGRPC(err) } log.L.Debugf("Kill succeeded") return empty, nil } // Pids returns all pids inside the container. func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) { log.L.Debugf("Pids, id: %s", r.ID) pids, err := s.getContainerPids(ctx, r.ID) if err != nil { return nil, utils.ErrToGRPC(err) } var processes []*task.ProcessInfo for _, pid := range pids { pInfo := task.ProcessInfo{ Pid: pid, } for _, p := range s.processes { if p.Pid() == int(pid) { d := &runctypes.ProcessDetails{ ExecID: p.ID(), } a, err := typeurl.MarshalAny(d) if err != nil { return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err) } pInfo.Info = a break } } processes = append(processes, &pInfo) } return &taskAPI.PidsResponse{ Processes: processes, }, nil } // CloseIO closes the I/O context of a process. func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) { log.L.Debugf("CloseIO, id: %s, execID: %s, stdin: %t", r.ID, r.ExecID, r.Stdin) p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if stdin := p.Stdin(); stdin != nil { if err := stdin.Close(); err != nil { return nil, fmt.Errorf("close stdin: %w", err) } } return empty, nil } // Checkpoint checkpoints the container. func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) { log.L.Debugf("Checkpoint, id: %s", r.ID) return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Connect returns shim information such as the shim's pid. func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) { log.L.Debugf("Connect, id: %s", r.ID) var pid int if s.task != nil { pid = s.task.Pid() } return &taskAPI.ConnectResponse{ ShimPid: uint32(os.Getpid()), TaskPid: uint32(pid), }, nil } func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) { log.L.Debugf("Shutdown, id: %s", r.ID) s.cancel() if s.shimAddress != "" { _ = shim.RemoveSocket(s.shimAddress) } os.Exit(0) panic("Should not get here") } func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) { log.L.Debugf("Stats, id: %s", r.ID) if s.task == nil { log.L.Debugf("Stats error, id: %s: container not created", r.ID) return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } stats, err := s.task.Stats(ctx, s.id) if err != nil { log.L.Debugf("Stats error, id: %s: %v", r.ID, err) return nil, err } // gvisor currently (as of 2020-03-03) only returns the total memory // usage and current PID value[0]. However, we copy the common fields here // so that future updates will propagate correct information. We're // using the cgroups.Metrics structure so we're returning the same type // as runc. // // [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81 metrics := &cgroupsstats.Metrics{ CPU: &cgroupsstats.CPUStat{ Usage: &cgroupsstats.CPUUsage{ Total: stats.Cpu.Usage.Total, Kernel: stats.Cpu.Usage.Kernel, User: stats.Cpu.Usage.User, PerCPU: stats.Cpu.Usage.Percpu, }, Throttling: &cgroupsstats.Throttle{ Periods: stats.Cpu.Throttling.Periods, ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods, ThrottledTime: stats.Cpu.Throttling.ThrottledTime, }, }, Memory: &cgroupsstats.MemoryStat{ Cache: stats.Memory.Cache, Usage: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Usage.Limit, Usage: stats.Memory.Usage.Usage, Max: stats.Memory.Usage.Max, Failcnt: stats.Memory.Usage.Failcnt, }, Swap: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Swap.Limit, Usage: stats.Memory.Swap.Usage, Max: stats.Memory.Swap.Max, Failcnt: stats.Memory.Swap.Failcnt, }, Kernel: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.Kernel.Limit, Usage: stats.Memory.Kernel.Usage, Max: stats.Memory.Kernel.Max, Failcnt: stats.Memory.Kernel.Failcnt, }, KernelTCP: &cgroupsstats.MemoryEntry{ Limit: stats.Memory.KernelTCP.Limit, Usage: stats.Memory.KernelTCP.Usage, Max: stats.Memory.KernelTCP.Max, Failcnt: stats.Memory.KernelTCP.Failcnt, }, }, Pids: &cgroupsstats.PidsStat{ Current: stats.Pids.Current, Limit: stats.Pids.Limit, }, } data, err := typeurl.MarshalAny(metrics) if err != nil { log.L.Debugf("Stats error, id: %s: %v", r.ID, err) return nil, err } log.L.Debugf("Stats success, id: %s: %+v", r.ID, data) return &taskAPI.StatsResponse{ Stats: data, }, nil } // Update updates a running container. func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) { return empty, utils.ErrToGRPC(errdefs.ErrNotImplemented) } // Wait waits for a process to exit. func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) { log.L.Debugf("Wait, id: %s, execID: %s", r.ID, r.ExecID) p, err := s.getProcess(r.ExecID) if err != nil { log.L.Debugf("Wait failed to find process: %v", err) return nil, err } p.Wait() res := &taskAPI.WaitResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), } log.L.Debugf("Wait succeeded, response: %+v", res) return res, nil } func (s *service) processExits(ctx context.Context) { for e := range s.ec { s.checkProcesses(ctx, e) } } func (s *service) checkProcesses(ctx context.Context, e proc.Exit) { // TODO(random-liu): Add `shouldKillAll` logic if container pid // namespace is supported. for _, p := range s.allProcesses() { if p.ID() == e.ID { if ip, ok := p.(*proc.Init); ok { // Ensure all children are killed. log.L.Debugf("Container init process exited, killing all container processes") ip.KillAll(ctx) } p.SetExited(e.Status) s.events <- &events.TaskExit{ ContainerID: s.id, ID: p.ID(), Pid: uint32(p.Pid()), ExitStatus: uint32(e.Status), ExitedAt: p.ExitedAt(), } return } } } func (s *service) allProcesses() (o []process.Process) { s.mu.Lock() defer s.mu.Unlock() for _, p := range s.processes { o = append(o, p) } if s.task != nil { o = append(o, s.task) } return o } func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition) } ps, err := p.Runtime().Ps(ctx, id) if err != nil { return nil, err } pids := make([]uint32, 0, len(ps)) for _, pid := range ps { pids = append(pids, uint32(pid)) } return pids, nil } func (s *service) forward(ctx context.Context, publisher shim.Publisher) { for e := range s.events { err := publisher.Publish(ctx, getTopic(e), e) if err != nil { // Should not happen. panic(fmt.Errorf("post event: %w", err)) } } } func (s *service) getProcess(execID string) (process.Process, error) { s.mu.Lock() defer s.mu.Unlock() if execID == "" { if s.task == nil { return nil, utils.ErrToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } return s.task, nil } p := s.processes[execID] if p == nil { return nil, utils.ErrToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) } return p, nil } func getTopic(e interface{}) string { switch e.(type) { case *events.TaskCreate: return runtime.TaskCreateEventTopic case *events.TaskStart: return runtime.TaskStartEventTopic case *events.TaskOOM: return runtime.TaskOOMEventTopic case *events.TaskExit: return runtime.TaskExitEventTopic case *events.TaskDelete: return runtime.TaskDeleteEventTopic case *events.TaskExecAdded: return runtime.TaskExecAddedEventTopic case *events.TaskExecStarted: return runtime.TaskExecStartedEventTopic default: log.L.Infof("no topic for type %#v", e) } return runtime.TaskUnknownTopic } func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *options, rootfs string) (*proc.Init, error) { spec, err := utils.ReadSpec(r.Bundle) if err != nil { return nil, fmt.Errorf("read oci spec: %w", err) } updated, err := utils.UpdateVolumeAnnotations(spec) if err != nil { return nil, fmt.Errorf("update volume annotations: %w", err) } updated = setPodCgroup(spec) || updated if updated { if err := utils.WriteSpec(r.Bundle, spec); err != nil { return nil, err } } runsc.FormatRunscLogPath(r.ID, options.RunscConfig) runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig) p := proc.New(r.ID, runtime, stdio.Stdio{ Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Terminal: r.Terminal, }) p.Bundle = r.Bundle p.Platform = platform p.Rootfs = rootfs p.WorkDir = workDir p.IoUID = int(options.IoUID) p.IoGID = int(options.IoGID) p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox p.UserLog = utils.UserLogPath(spec) p.Monitor = reaper.Default return p, nil } // setPodCgroup searches for the pod cgroup path inside the container's cgroup // path. If found, it's set as an annotation in the spec. This is done so that // the sandbox joins the pod cgroup. Otherwise, the sandbox would join the pause // container cgroup. Returns true if the spec was modified. Ex.: // /kubepods/burstable/pod123/container123 => kubepods/burstable/pod123 // func setPodCgroup(spec *specs.Spec) bool { if !utils.IsSandbox(spec) { return false } if spec.Linux == nil || len(spec.Linux.CgroupsPath) == 0 { return false } // Search backwards for the pod cgroup path to make the sandbox use it, // instead of the pause container's cgroup. parts := strings.Split(spec.Linux.CgroupsPath, string(filepath.Separator)) for i := len(parts) - 1; i >= 0; i-- { if strings.HasPrefix(parts[i], "pod") { var path string for j := 0; j <= i; j++ { path = filepath.Join(path, parts[j]) } // Add back the initial '/' that may have been lost above. if filepath.IsAbs(spec.Linux.CgroupsPath) { path = string(filepath.Separator) + path } if spec.Linux.CgroupsPath == path { return false } if spec.Annotations == nil { spec.Annotations = make(map[string]string) } spec.Annotations[cgroupParentAnnotation] = path return true } } return false }