summaryrefslogtreecommitdiffhomepage
path: root/pkg/shim
diff options
context:
space:
mode:
authorIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
committerIan Lewis <ianmlewis@gmail.com>2020-08-17 21:44:31 -0400
commitac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/shim
parent352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
Merge branch 'master' into ip-forwarding
- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
Diffstat (limited to 'pkg/shim')
-rw-r--r--pkg/shim/runsc/BUILD16
-rw-r--r--pkg/shim/runsc/runsc.go514
-rw-r--r--pkg/shim/runsc/utils.go44
-rw-r--r--pkg/shim/v1/proc/BUILD36
-rw-r--r--pkg/shim/v1/proc/deleted_state.go49
-rw-r--r--pkg/shim/v1/proc/exec.go281
-rw-r--r--pkg/shim/v1/proc/exec_state.go154
-rw-r--r--pkg/shim/v1/proc/init.go460
-rw-r--r--pkg/shim/v1/proc/init_state.go182
-rw-r--r--pkg/shim/v1/proc/io.go162
-rw-r--r--pkg/shim/v1/proc/process.go37
-rw-r--r--pkg/shim/v1/proc/types.go69
-rw-r--r--pkg/shim/v1/proc/utils.go90
-rw-r--r--pkg/shim/v1/shim/BUILD40
-rw-r--r--pkg/shim/v1/shim/api.go28
-rw-r--r--pkg/shim/v1/shim/platform.go106
-rw-r--r--pkg/shim/v1/shim/service.go573
-rw-r--r--pkg/shim/v1/utils/BUILD27
-rw-r--r--pkg/shim/v1/utils/annotations.go25
-rw-r--r--pkg/shim/v1/utils/utils.go56
-rw-r--r--pkg/shim/v1/utils/volumes.go155
-rw-r--r--pkg/shim/v1/utils/volumes_test.go308
-rw-r--r--pkg/shim/v2/BUILD43
-rw-r--r--pkg/shim/v2/api.go22
-rw-r--r--pkg/shim/v2/epoll.go129
-rw-r--r--pkg/shim/v2/options/BUILD11
-rw-r--r--pkg/shim/v2/options/options.go33
-rw-r--r--pkg/shim/v2/runtimeoptions/BUILD20
-rw-r--r--pkg/shim/v2/runtimeoptions/runtimeoptions.go27
-rw-r--r--pkg/shim/v2/runtimeoptions/runtimeoptions.proto25
-rw-r--r--pkg/shim/v2/service.go824
-rw-r--r--pkg/shim/v2/service_linux.go108
32 files changed, 4654 insertions, 0 deletions
diff --git a/pkg/shim/runsc/BUILD b/pkg/shim/runsc/BUILD
new file mode 100644
index 000000000..f08599ebd
--- /dev/null
+++ b/pkg/shim/runsc/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "runsc",
+ srcs = [
+ "runsc.go",
+ "utils.go",
+ ],
+ visibility = ["//:sandbox"],
+ deps = [
+ "@com_github_containerd_go_runc//:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ ],
+)
diff --git a/pkg/shim/runsc/runsc.go b/pkg/shim/runsc/runsc.go
new file mode 100644
index 000000000..c5cf68efa
--- /dev/null
+++ b/pkg/shim/runsc/runsc.go
@@ -0,0 +1,514 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runsc
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "syscall"
+ "time"
+
+ runc "github.com/containerd/go-runc"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+var Monitor runc.ProcessMonitor = runc.Monitor
+
+// DefaultCommand is the default command for Runsc.
+const DefaultCommand = "runsc"
+
+// Runsc is the client to the runsc cli.
+type Runsc struct {
+ Command string
+ PdeathSignal syscall.Signal
+ Setpgid bool
+ Root string
+ Log string
+ LogFormat runc.Format
+ Config map[string]string
+}
+
+// List returns all containers created inside the provided runsc root directory.
+func (r *Runsc) List(context context.Context) ([]*runc.Container, error) {
+ data, err := cmdOutput(r.command(context, "list", "--format=json"), false)
+ if err != nil {
+ return nil, err
+ }
+ var out []*runc.Container
+ if err := json.Unmarshal(data, &out); err != nil {
+ return nil, err
+ }
+ return out, nil
+}
+
+// State returns the state for the container provided by id.
+func (r *Runsc) State(context context.Context, id string) (*runc.Container, error) {
+ data, err := cmdOutput(r.command(context, "state", id), true)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %s", err, data)
+ }
+ var c runc.Container
+ if err := json.Unmarshal(data, &c); err != nil {
+ return nil, err
+ }
+ return &c, nil
+}
+
+type CreateOpts struct {
+ runc.IO
+ ConsoleSocket runc.ConsoleSocket
+
+ // PidFile is a path to where a pid file should be created.
+ PidFile string
+
+ // UserLog is a path to where runsc user log should be generated.
+ UserLog string
+}
+
+func (o *CreateOpts) args() (out []string, err error) {
+ if o.PidFile != "" {
+ abs, err := filepath.Abs(o.PidFile)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, "--pid-file", abs)
+ }
+ if o.ConsoleSocket != nil {
+ out = append(out, "--console-socket", o.ConsoleSocket.Path())
+ }
+ if o.UserLog != "" {
+ out = append(out, "--user-log", o.UserLog)
+ }
+ return out, nil
+}
+
+// Create creates a new container and returns its pid if it was created successfully.
+func (r *Runsc) Create(context context.Context, id, bundle string, opts *CreateOpts) error {
+ args := []string{"create", "--bundle", bundle}
+ if opts != nil {
+ oargs, err := opts.args()
+ if err != nil {
+ return err
+ }
+ args = append(args, oargs...)
+ }
+ cmd := r.command(context, append(args, id)...)
+ if opts != nil && opts.IO != nil {
+ opts.Set(cmd)
+ }
+
+ if cmd.Stdout == nil && cmd.Stderr == nil {
+ data, err := cmdOutput(cmd, true)
+ if err != nil {
+ return fmt.Errorf("%s: %s", err, data)
+ }
+ return nil
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return err
+ }
+ if opts != nil && opts.IO != nil {
+ if c, ok := opts.IO.(runc.StartCloser); ok {
+ if err := c.CloseAfterStart(); err != nil {
+ return err
+ }
+ }
+ }
+ status, err := Monitor.Wait(cmd, ec)
+ if err == nil && status != 0 {
+ err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0])
+ }
+
+ return err
+}
+
+// Start will start an already created container.
+func (r *Runsc) Start(context context.Context, id string, cio runc.IO) error {
+ cmd := r.command(context, "start", id)
+ if cio != nil {
+ cio.Set(cmd)
+ }
+
+ if cmd.Stdout == nil && cmd.Stderr == nil {
+ data, err := cmdOutput(cmd, true)
+ if err != nil {
+ return fmt.Errorf("%s: %s", err, data)
+ }
+ return nil
+ }
+
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return err
+ }
+ if cio != nil {
+ if c, ok := cio.(runc.StartCloser); ok {
+ if err := c.CloseAfterStart(); err != nil {
+ return err
+ }
+ }
+ }
+ status, err := Monitor.Wait(cmd, ec)
+ if err == nil && status != 0 {
+ err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0])
+ }
+
+ return err
+}
+
+type waitResult struct {
+ ID string `json:"id"`
+ ExitStatus int `json:"exitStatus"`
+}
+
+// Wait will wait for a running container, and return its exit status.
+//
+// TODO(random-liu): Add exec process support.
+func (r *Runsc) Wait(context context.Context, id string) (int, error) {
+ data, err := cmdOutput(r.command(context, "wait", id), true)
+ if err != nil {
+ return 0, fmt.Errorf("%s: %s", err, data)
+ }
+ var res waitResult
+ if err := json.Unmarshal(data, &res); err != nil {
+ return 0, err
+ }
+ return res.ExitStatus, nil
+}
+
+type ExecOpts struct {
+ runc.IO
+ PidFile string
+ InternalPidFile string
+ ConsoleSocket runc.ConsoleSocket
+ Detach bool
+}
+
+func (o *ExecOpts) args() (out []string, err error) {
+ if o.ConsoleSocket != nil {
+ out = append(out, "--console-socket", o.ConsoleSocket.Path())
+ }
+ if o.Detach {
+ out = append(out, "--detach")
+ }
+ if o.PidFile != "" {
+ abs, err := filepath.Abs(o.PidFile)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, "--pid-file", abs)
+ }
+ if o.InternalPidFile != "" {
+ abs, err := filepath.Abs(o.InternalPidFile)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, "--internal-pid-file", abs)
+ }
+ return out, nil
+}
+
+// Exec executes an additional process inside the container based on a full OCI
+// Process specification.
+func (r *Runsc) Exec(context context.Context, id string, spec specs.Process, opts *ExecOpts) error {
+ f, err := ioutil.TempFile(os.Getenv("XDG_RUNTIME_DIR"), "runsc-process")
+ if err != nil {
+ return err
+ }
+ defer os.Remove(f.Name())
+ err = json.NewEncoder(f).Encode(spec)
+ f.Close()
+ if err != nil {
+ return err
+ }
+ args := []string{"exec", "--process", f.Name()}
+ if opts != nil {
+ oargs, err := opts.args()
+ if err != nil {
+ return err
+ }
+ args = append(args, oargs...)
+ }
+ cmd := r.command(context, append(args, id)...)
+ if opts != nil && opts.IO != nil {
+ opts.Set(cmd)
+ }
+ if cmd.Stdout == nil && cmd.Stderr == nil {
+ data, err := cmdOutput(cmd, true)
+ if err != nil {
+ return fmt.Errorf("%s: %s", err, data)
+ }
+ return nil
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return err
+ }
+ if opts != nil && opts.IO != nil {
+ if c, ok := opts.IO.(runc.StartCloser); ok {
+ if err := c.CloseAfterStart(); err != nil {
+ return err
+ }
+ }
+ }
+ status, err := Monitor.Wait(cmd, ec)
+ if err == nil && status != 0 {
+ err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0])
+ }
+ return err
+}
+
+// Run runs the create, start, delete lifecycle of the container and returns
+// its exit status after it has exited.
+func (r *Runsc) Run(context context.Context, id, bundle string, opts *CreateOpts) (int, error) {
+ args := []string{"run", "--bundle", bundle}
+ if opts != nil {
+ oargs, err := opts.args()
+ if err != nil {
+ return -1, err
+ }
+ args = append(args, oargs...)
+ }
+ cmd := r.command(context, append(args, id)...)
+ if opts != nil && opts.IO != nil {
+ opts.Set(cmd)
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return -1, err
+ }
+ return Monitor.Wait(cmd, ec)
+}
+
+type DeleteOpts struct {
+ Force bool
+}
+
+func (o *DeleteOpts) args() (out []string) {
+ if o.Force {
+ out = append(out, "--force")
+ }
+ return out
+}
+
+// Delete deletes the container.
+func (r *Runsc) Delete(context context.Context, id string, opts *DeleteOpts) error {
+ args := []string{"delete"}
+ if opts != nil {
+ args = append(args, opts.args()...)
+ }
+ return r.runOrError(r.command(context, append(args, id)...))
+}
+
+// KillOpts specifies options for killing a container and its processes.
+type KillOpts struct {
+ All bool
+ Pid int
+}
+
+func (o *KillOpts) args() (out []string) {
+ if o.All {
+ out = append(out, "--all")
+ }
+ if o.Pid != 0 {
+ out = append(out, "--pid", strconv.Itoa(o.Pid))
+ }
+ return out
+}
+
+// Kill sends the specified signal to the container.
+func (r *Runsc) Kill(context context.Context, id string, sig int, opts *KillOpts) error {
+ args := []string{
+ "kill",
+ }
+ if opts != nil {
+ args = append(args, opts.args()...)
+ }
+ return r.runOrError(r.command(context, append(args, id, strconv.Itoa(sig))...))
+}
+
+// Stats return the stats for a container like cpu, memory, and I/O.
+func (r *Runsc) Stats(context context.Context, id string) (*runc.Stats, error) {
+ cmd := r.command(context, "events", "--stats", id)
+ rd, err := cmd.StdoutPipe()
+ if err != nil {
+ return nil, err
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return nil, err
+ }
+ defer func() {
+ rd.Close()
+ Monitor.Wait(cmd, ec)
+ }()
+ var e runc.Event
+ if err := json.NewDecoder(rd).Decode(&e); err != nil {
+ return nil, err
+ }
+ return e.Stats, nil
+}
+
+// Events returns an event stream from runsc for a container with stats and OOM notifications.
+func (r *Runsc) Events(context context.Context, id string, interval time.Duration) (chan *runc.Event, error) {
+ cmd := r.command(context, "events", fmt.Sprintf("--interval=%ds", int(interval.Seconds())), id)
+ rd, err := cmd.StdoutPipe()
+ if err != nil {
+ return nil, err
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ rd.Close()
+ return nil, err
+ }
+ var (
+ dec = json.NewDecoder(rd)
+ c = make(chan *runc.Event, 128)
+ )
+ go func() {
+ defer func() {
+ close(c)
+ rd.Close()
+ Monitor.Wait(cmd, ec)
+ }()
+ for {
+ var e runc.Event
+ if err := dec.Decode(&e); err != nil {
+ if err == io.EOF {
+ return
+ }
+ e = runc.Event{
+ Type: "error",
+ Err: err,
+ }
+ }
+ c <- &e
+ }
+ }()
+ return c, nil
+}
+
+// Ps lists all the processes inside the container returning their pids.
+func (r *Runsc) Ps(context context.Context, id string) ([]int, error) {
+ data, err := cmdOutput(r.command(context, "ps", "--format", "json", id), true)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %s", err, data)
+ }
+ var pids []int
+ if err := json.Unmarshal(data, &pids); err != nil {
+ return nil, err
+ }
+ return pids, nil
+}
+
+// Top lists all the processes inside the container returning the full ps data.
+func (r *Runsc) Top(context context.Context, id string) (*runc.TopResults, error) {
+ data, err := cmdOutput(r.command(context, "ps", "--format", "table", id), true)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %s", err, data)
+ }
+
+ topResults, err := runc.ParsePSOutput(data)
+ if err != nil {
+ return nil, fmt.Errorf("%s: ", err)
+ }
+ return topResults, nil
+}
+
+func (r *Runsc) args() []string {
+ var args []string
+ if r.Root != "" {
+ args = append(args, fmt.Sprintf("--root=%s", r.Root))
+ }
+ if r.Log != "" {
+ args = append(args, fmt.Sprintf("--log=%s", r.Log))
+ }
+ if r.LogFormat != "" {
+ args = append(args, fmt.Sprintf("--log-format=%s", r.LogFormat))
+ }
+ for k, v := range r.Config {
+ args = append(args, fmt.Sprintf("--%s=%s", k, v))
+ }
+ return args
+}
+
+// runOrError will run the provided command.
+//
+// If an error is encountered and neither Stdout or Stderr was set the error
+// will be returned in the format of <error>: <stderr>.
+func (r *Runsc) runOrError(cmd *exec.Cmd) error {
+ if cmd.Stdout != nil || cmd.Stderr != nil {
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return err
+ }
+ status, err := Monitor.Wait(cmd, ec)
+ if err == nil && status != 0 {
+ err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0])
+ }
+ return err
+ }
+ data, err := cmdOutput(cmd, true)
+ if err != nil {
+ return fmt.Errorf("%s: %s", err, data)
+ }
+ return nil
+}
+
+func (r *Runsc) command(context context.Context, args ...string) *exec.Cmd {
+ command := r.Command
+ if command == "" {
+ command = DefaultCommand
+ }
+ cmd := exec.CommandContext(context, command, append(r.args(), args...)...)
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Setpgid: r.Setpgid,
+ }
+ if r.PdeathSignal != 0 {
+ cmd.SysProcAttr.Pdeathsig = r.PdeathSignal
+ }
+
+ return cmd
+}
+
+func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, error) {
+ b := getBuf()
+ defer putBuf(b)
+
+ cmd.Stdout = b
+ if combined {
+ cmd.Stderr = b
+ }
+ ec, err := Monitor.Start(cmd)
+ if err != nil {
+ return nil, err
+ }
+
+ status, err := Monitor.Wait(cmd, ec)
+ if err == nil && status != 0 {
+ err = fmt.Errorf("%s did not terminate sucessfully", cmd.Args[0])
+ }
+
+ return b.Bytes(), err
+}
diff --git a/pkg/shim/runsc/utils.go b/pkg/shim/runsc/utils.go
new file mode 100644
index 000000000..c514b3bc7
--- /dev/null
+++ b/pkg/shim/runsc/utils.go
@@ -0,0 +1,44 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runsc
+
+import (
+ "bytes"
+ "strings"
+ "sync"
+)
+
+var bytesBufferPool = sync.Pool{
+ New: func() interface{} {
+ return bytes.NewBuffer(nil)
+ },
+}
+
+func getBuf() *bytes.Buffer {
+ return bytesBufferPool.Get().(*bytes.Buffer)
+}
+
+func putBuf(b *bytes.Buffer) {
+ b.Reset()
+ bytesBufferPool.Put(b)
+}
+
+// FormatLogPath parses runsc config, and fill in %ID% in the log path.
+func FormatLogPath(id string, config map[string]string) {
+ if path, ok := config["debug-log"]; ok {
+ config["debug-log"] = strings.Replace(path, "%ID%", id, -1)
+ }
+}
diff --git a/pkg/shim/v1/proc/BUILD b/pkg/shim/v1/proc/BUILD
new file mode 100644
index 000000000..4377306af
--- /dev/null
+++ b/pkg/shim/v1/proc/BUILD
@@ -0,0 +1,36 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "proc",
+ srcs = [
+ "deleted_state.go",
+ "exec.go",
+ "exec_state.go",
+ "init.go",
+ "init_state.go",
+ "io.go",
+ "process.go",
+ "types.go",
+ "utils.go",
+ ],
+ visibility = [
+ "//pkg/shim:__subpackages__",
+ "//shim:__subpackages__",
+ ],
+ deps = [
+ "//pkg/shim/runsc",
+ "@com_github_containerd_console//:go_default_library",
+ "@com_github_containerd_containerd//errdefs:go_default_library",
+ "@com_github_containerd_containerd//log:go_default_library",
+ "@com_github_containerd_containerd//mount:go_default_library",
+ "@com_github_containerd_containerd//pkg/process:go_default_library",
+ "@com_github_containerd_containerd//pkg/stdio:go_default_library",
+ "@com_github_containerd_fifo//:go_default_library",
+ "@com_github_containerd_go_runc//:go_default_library",
+ "@com_github_gogo_protobuf//types:go_default_library",
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
diff --git a/pkg/shim/v1/proc/deleted_state.go b/pkg/shim/v1/proc/deleted_state.go
new file mode 100644
index 000000000..d9b970c4d
--- /dev/null
+++ b/pkg/shim/v1/proc/deleted_state.go
@@ -0,0 +1,49 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/pkg/process"
+)
+
+type deletedState struct{}
+
+func (*deletedState) Resize(ws console.WinSize) error {
+ return fmt.Errorf("cannot resize a deleted process.ss")
+}
+
+func (*deletedState) Start(ctx context.Context) error {
+ return fmt.Errorf("cannot start a deleted process.ss")
+}
+
+func (*deletedState) Delete(ctx context.Context) error {
+ return fmt.Errorf("cannot delete a deleted process.ss: %w", errdefs.ErrNotFound)
+}
+
+func (*deletedState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return fmt.Errorf("cannot kill a deleted process.ss: %w", errdefs.ErrNotFound)
+}
+
+func (*deletedState) SetExited(status int) {}
+
+func (*deletedState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ return nil, fmt.Errorf("cannot exec in a deleted state")
+}
diff --git a/pkg/shim/v1/proc/exec.go b/pkg/shim/v1/proc/exec.go
new file mode 100644
index 000000000..1d1d90488
--- /dev/null
+++ b/pkg/shim/v1/proc/exec.go
@@ -0,0 +1,281 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "os"
+ "path/filepath"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/pkg/stdio"
+ "github.com/containerd/fifo"
+ runc "github.com/containerd/go-runc"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "golang.org/x/sys/unix"
+
+ "gvisor.dev/gvisor/pkg/shim/runsc"
+)
+
+type execProcess struct {
+ wg sync.WaitGroup
+
+ execState execState
+
+ mu sync.Mutex
+ id string
+ console console.Console
+ io runc.IO
+ status int
+ exited time.Time
+ pid int
+ internalPid int
+ closers []io.Closer
+ stdin io.Closer
+ stdio stdio.Stdio
+ path string
+ spec specs.Process
+
+ parent *Init
+ waitBlock chan struct{}
+}
+
+func (e *execProcess) Wait() {
+ <-e.waitBlock
+}
+
+func (e *execProcess) ID() string {
+ return e.id
+}
+
+func (e *execProcess) Pid() int {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.pid
+}
+
+func (e *execProcess) ExitStatus() int {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.status
+}
+
+func (e *execProcess) ExitedAt() time.Time {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.exited
+}
+
+func (e *execProcess) SetExited(status int) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ e.execState.SetExited(status)
+}
+
+func (e *execProcess) setExited(status int) {
+ e.status = status
+ e.exited = time.Now()
+ e.parent.Platform.ShutdownConsole(context.Background(), e.console)
+ close(e.waitBlock)
+}
+
+func (e *execProcess) Delete(ctx context.Context) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ return e.execState.Delete(ctx)
+}
+
+func (e *execProcess) delete(ctx context.Context) error {
+ e.wg.Wait()
+ if e.io != nil {
+ for _, c := range e.closers {
+ c.Close()
+ }
+ e.io.Close()
+ }
+ pidfile := filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id))
+ // silently ignore error
+ os.Remove(pidfile)
+ internalPidfile := filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id))
+ // silently ignore error
+ os.Remove(internalPidfile)
+ return nil
+}
+
+func (e *execProcess) Resize(ws console.WinSize) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ return e.execState.Resize(ws)
+}
+
+func (e *execProcess) resize(ws console.WinSize) error {
+ if e.console == nil {
+ return nil
+ }
+ return e.console.Resize(ws)
+}
+
+func (e *execProcess) Kill(ctx context.Context, sig uint32, _ bool) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ return e.execState.Kill(ctx, sig, false)
+}
+
+func (e *execProcess) kill(ctx context.Context, sig uint32, _ bool) error {
+ internalPid := e.internalPid
+ if internalPid != 0 {
+ if err := e.parent.runtime.Kill(ctx, e.parent.id, int(sig), &runsc.KillOpts{
+ Pid: internalPid,
+ }); err != nil {
+ // If this returns error, consider the process has
+ // already stopped.
+ //
+ // TODO: Fix after signal handling is fixed.
+ return fmt.Errorf("%s: %w", err.Error(), errdefs.ErrNotFound)
+ }
+ }
+ return nil
+}
+
+func (e *execProcess) Stdin() io.Closer {
+ return e.stdin
+}
+
+func (e *execProcess) Stdio() stdio.Stdio {
+ return e.stdio
+}
+
+func (e *execProcess) Start(ctx context.Context) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ return e.execState.Start(ctx)
+}
+
+func (e *execProcess) start(ctx context.Context) (err error) {
+ var (
+ socket *runc.Socket
+ pidfile = filepath.Join(e.path, fmt.Sprintf("%s.pid", e.id))
+ internalPidfile = filepath.Join(e.path, fmt.Sprintf("%s-internal.pid", e.id))
+ )
+ if e.stdio.Terminal {
+ if socket, err = runc.NewTempConsoleSocket(); err != nil {
+ return fmt.Errorf("failed to create runc console socket: %w", err)
+ }
+ defer socket.Close()
+ } else if e.stdio.IsNull() {
+ if e.io, err = runc.NewNullIO(); err != nil {
+ return fmt.Errorf("creating new NULL IO: %w", err)
+ }
+ } else {
+ if e.io, err = runc.NewPipeIO(e.parent.IoUID, e.parent.IoGID, withConditionalIO(e.stdio)); err != nil {
+ return fmt.Errorf("failed to create runc io pipes: %w", err)
+ }
+ }
+ opts := &runsc.ExecOpts{
+ PidFile: pidfile,
+ InternalPidFile: internalPidfile,
+ IO: e.io,
+ Detach: true,
+ }
+ if socket != nil {
+ opts.ConsoleSocket = socket
+ }
+ eventCh := e.parent.Monitor.Subscribe()
+ defer func() {
+ // Unsubscribe if an error is returned.
+ if err != nil {
+ e.parent.Monitor.Unsubscribe(eventCh)
+ }
+ }()
+ if err := e.parent.runtime.Exec(ctx, e.parent.id, e.spec, opts); err != nil {
+ close(e.waitBlock)
+ return e.parent.runtimeError(err, "OCI runtime exec failed")
+ }
+ if e.stdio.Stdin != "" {
+ sc, err := fifo.OpenFifo(context.Background(), e.stdio.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
+ if err != nil {
+ return fmt.Errorf("failed to open stdin fifo %s: %w", e.stdio.Stdin, err)
+ }
+ e.closers = append(e.closers, sc)
+ e.stdin = sc
+ }
+ ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+ defer cancel()
+ if socket != nil {
+ console, err := socket.ReceiveMaster()
+ if err != nil {
+ return fmt.Errorf("failed to retrieve console master: %w", err)
+ }
+ if e.console, err = e.parent.Platform.CopyConsole(ctx, console, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg); err != nil {
+ return fmt.Errorf("failed to start console copy: %w", err)
+ }
+ } else if !e.stdio.IsNull() {
+ if err := copyPipes(ctx, e.io, e.stdio.Stdin, e.stdio.Stdout, e.stdio.Stderr, &e.wg); err != nil {
+ return fmt.Errorf("failed to start io pipe copy: %w", err)
+ }
+ }
+ pid, err := runc.ReadPidFile(opts.PidFile)
+ if err != nil {
+ return fmt.Errorf("failed to retrieve OCI runtime exec pid: %w", err)
+ }
+ e.pid = pid
+ internalPid, err := runc.ReadPidFile(opts.InternalPidFile)
+ if err != nil {
+ return fmt.Errorf("failed to retrieve OCI runtime exec internal pid: %w", err)
+ }
+ e.internalPid = internalPid
+ go func() {
+ defer e.parent.Monitor.Unsubscribe(eventCh)
+ for event := range eventCh {
+ if event.Pid == e.pid {
+ ExitCh <- Exit{
+ Timestamp: event.Timestamp,
+ ID: e.id,
+ Status: event.Status,
+ }
+ break
+ }
+ }
+ }()
+ return nil
+}
+
+func (e *execProcess) Status(ctx context.Context) (string, error) {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ // if we don't have a pid then the exec process has just been created
+ if e.pid == 0 {
+ return "created", nil
+ }
+ // if we have a pid and it can be signaled, the process is running
+ // TODO(random-liu): Use `runsc kill --pid`.
+ if err := unix.Kill(e.pid, 0); err == nil {
+ return "running", nil
+ }
+ // else if we have a pid but it can nolonger be signaled, it has stopped
+ return "stopped", nil
+}
diff --git a/pkg/shim/v1/proc/exec_state.go b/pkg/shim/v1/proc/exec_state.go
new file mode 100644
index 000000000..4dcda8b44
--- /dev/null
+++ b/pkg/shim/v1/proc/exec_state.go
@@ -0,0 +1,154 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/containerd/console"
+)
+
+type execState interface {
+ Resize(console.WinSize) error
+ Start(context.Context) error
+ Delete(context.Context) error
+ Kill(context.Context, uint32, bool) error
+ SetExited(int)
+}
+
+type execCreatedState struct {
+ p *execProcess
+}
+
+func (s *execCreatedState) transition(name string) error {
+ switch name {
+ case "running":
+ s.p.execState = &execRunningState{p: s.p}
+ case "stopped":
+ s.p.execState = &execStoppedState{p: s.p}
+ case "deleted":
+ s.p.execState = &deletedState{}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *execCreatedState) Resize(ws console.WinSize) error {
+ return s.p.resize(ws)
+}
+
+func (s *execCreatedState) Start(ctx context.Context) error {
+ if err := s.p.start(ctx); err != nil {
+ return err
+ }
+ return s.transition("running")
+}
+
+func (s *execCreatedState) Delete(ctx context.Context) error {
+ if err := s.p.delete(ctx); err != nil {
+ return err
+ }
+ return s.transition("deleted")
+}
+
+func (s *execCreatedState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return s.p.kill(ctx, sig, all)
+}
+
+func (s *execCreatedState) SetExited(status int) {
+ s.p.setExited(status)
+
+ if err := s.transition("stopped"); err != nil {
+ panic(err)
+ }
+}
+
+type execRunningState struct {
+ p *execProcess
+}
+
+func (s *execRunningState) transition(name string) error {
+ switch name {
+ case "stopped":
+ s.p.execState = &execStoppedState{p: s.p}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *execRunningState) Resize(ws console.WinSize) error {
+ return s.p.resize(ws)
+}
+
+func (s *execRunningState) Start(ctx context.Context) error {
+ return fmt.Errorf("cannot start a running process")
+}
+
+func (s *execRunningState) Delete(ctx context.Context) error {
+ return fmt.Errorf("cannot delete a running process")
+}
+
+func (s *execRunningState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return s.p.kill(ctx, sig, all)
+}
+
+func (s *execRunningState) SetExited(status int) {
+ s.p.setExited(status)
+
+ if err := s.transition("stopped"); err != nil {
+ panic(err)
+ }
+}
+
+type execStoppedState struct {
+ p *execProcess
+}
+
+func (s *execStoppedState) transition(name string) error {
+ switch name {
+ case "deleted":
+ s.p.execState = &deletedState{}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *execStoppedState) Resize(ws console.WinSize) error {
+ return fmt.Errorf("cannot resize a stopped container")
+}
+
+func (s *execStoppedState) Start(ctx context.Context) error {
+ return fmt.Errorf("cannot start a stopped process")
+}
+
+func (s *execStoppedState) Delete(ctx context.Context) error {
+ if err := s.p.delete(ctx); err != nil {
+ return err
+ }
+ return s.transition("deleted")
+}
+
+func (s *execStoppedState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return s.p.kill(ctx, sig, all)
+}
+
+func (s *execStoppedState) SetExited(status int) {
+ // no op
+}
diff --git a/pkg/shim/v1/proc/init.go b/pkg/shim/v1/proc/init.go
new file mode 100644
index 000000000..dab3123d6
--- /dev/null
+++ b/pkg/shim/v1/proc/init.go
@@ -0,0 +1,460 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "path/filepath"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/log"
+ "github.com/containerd/containerd/mount"
+ "github.com/containerd/containerd/pkg/process"
+ "github.com/containerd/containerd/pkg/stdio"
+ "github.com/containerd/fifo"
+ runc "github.com/containerd/go-runc"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+
+ "gvisor.dev/gvisor/pkg/shim/runsc"
+)
+
+// InitPidFile name of the file that contains the init pid.
+const InitPidFile = "init.pid"
+
+// Init represents an initial process for a container.
+type Init struct {
+ wg sync.WaitGroup
+ initState initState
+
+ // mu is used to ensure that `Start()` and `Exited()` calls return in
+ // the right order when invoked in separate go routines. This is the
+ // case within the shim implementation as it makes use of the reaper
+ // interface.
+ mu sync.Mutex
+
+ waitBlock chan struct{}
+
+ WorkDir string
+
+ id string
+ Bundle string
+ console console.Console
+ Platform stdio.Platform
+ io runc.IO
+ runtime *runsc.Runsc
+ status int
+ exited time.Time
+ pid int
+ closers []io.Closer
+ stdin io.Closer
+ stdio stdio.Stdio
+ Rootfs string
+ IoUID int
+ IoGID int
+ Sandbox bool
+ UserLog string
+ Monitor ProcessMonitor
+}
+
+// NewRunsc returns a new runsc instance for a process.
+func NewRunsc(root, path, namespace, runtime string, config map[string]string) *runsc.Runsc {
+ if root == "" {
+ root = RunscRoot
+ }
+ return &runsc.Runsc{
+ Command: runtime,
+ PdeathSignal: syscall.SIGKILL,
+ Log: filepath.Join(path, "log.json"),
+ LogFormat: runc.JSON,
+ Root: filepath.Join(root, namespace),
+ Config: config,
+ }
+}
+
+// New returns a new init process.
+func New(id string, runtime *runsc.Runsc, stdio stdio.Stdio) *Init {
+ p := &Init{
+ id: id,
+ runtime: runtime,
+ stdio: stdio,
+ status: 0,
+ waitBlock: make(chan struct{}),
+ }
+ p.initState = &createdState{p: p}
+ return p
+}
+
+// Create the process with the provided config.
+func (p *Init) Create(ctx context.Context, r *CreateConfig) (err error) {
+ var socket *runc.Socket
+ if r.Terminal {
+ if socket, err = runc.NewTempConsoleSocket(); err != nil {
+ return fmt.Errorf("failed to create OCI runtime console socket: %w", err)
+ }
+ defer socket.Close()
+ } else if hasNoIO(r) {
+ if p.io, err = runc.NewNullIO(); err != nil {
+ return fmt.Errorf("creating new NULL IO: %w", err)
+ }
+ } else {
+ if p.io, err = runc.NewPipeIO(p.IoUID, p.IoGID, withConditionalIO(p.stdio)); err != nil {
+ return fmt.Errorf("failed to create OCI runtime io pipes: %w", err)
+ }
+ }
+ pidFile := filepath.Join(p.Bundle, InitPidFile)
+ opts := &runsc.CreateOpts{
+ PidFile: pidFile,
+ }
+ if socket != nil {
+ opts.ConsoleSocket = socket
+ }
+ if p.Sandbox {
+ opts.IO = p.io
+ // UserLog is only useful for sandbox.
+ opts.UserLog = p.UserLog
+ }
+ if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil {
+ return p.runtimeError(err, "OCI runtime create failed")
+ }
+ if r.Stdin != "" {
+ sc, err := fifo.OpenFifo(context.Background(), r.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0)
+ if err != nil {
+ return fmt.Errorf("failed to open stdin fifo %s: %w", r.Stdin, err)
+ }
+ p.stdin = sc
+ p.closers = append(p.closers, sc)
+ }
+ ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+ defer cancel()
+ if socket != nil {
+ console, err := socket.ReceiveMaster()
+ if err != nil {
+ return fmt.Errorf("failed to retrieve console master: %w", err)
+ }
+ console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg)
+ if err != nil {
+ return fmt.Errorf("failed to start console copy: %w", err)
+ }
+ p.console = console
+ } else if !hasNoIO(r) {
+ if err := copyPipes(ctx, p.io, r.Stdin, r.Stdout, r.Stderr, &p.wg); err != nil {
+ return fmt.Errorf("failed to start io pipe copy: %w", err)
+ }
+ }
+ pid, err := runc.ReadPidFile(pidFile)
+ if err != nil {
+ return fmt.Errorf("failed to retrieve OCI runtime container pid: %w", err)
+ }
+ p.pid = pid
+ return nil
+}
+
+// Wait waits for the process to exit.
+func (p *Init) Wait() {
+ <-p.waitBlock
+}
+
+// ID returns the ID of the process.
+func (p *Init) ID() string {
+ return p.id
+}
+
+// Pid returns the PID of the process.
+func (p *Init) Pid() int {
+ return p.pid
+}
+
+// ExitStatus returns the exit status of the process.
+func (p *Init) ExitStatus() int {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.status
+}
+
+// ExitedAt returns the time when the process exited.
+func (p *Init) ExitedAt() time.Time {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.exited
+}
+
+// Status returns the status of the process.
+func (p *Init) Status(ctx context.Context) (string, error) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ c, err := p.runtime.State(ctx, p.id)
+ if err != nil {
+ if strings.Contains(err.Error(), "does not exist") {
+ return "stopped", nil
+ }
+ return "", p.runtimeError(err, "OCI runtime state failed")
+ }
+ return p.convertStatus(c.Status), nil
+}
+
+// Start starts the init process.
+func (p *Init) Start(ctx context.Context) error {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ return p.initState.Start(ctx)
+}
+
+func (p *Init) start(ctx context.Context) error {
+ var cio runc.IO
+ if !p.Sandbox {
+ cio = p.io
+ }
+ if err := p.runtime.Start(ctx, p.id, cio); err != nil {
+ return p.runtimeError(err, "OCI runtime start failed")
+ }
+ go func() {
+ status, err := p.runtime.Wait(context.Background(), p.id)
+ if err != nil {
+ log.G(ctx).WithError(err).Errorf("Failed to wait for container %q", p.id)
+ // TODO(random-liu): Handle runsc kill error.
+ if err := p.killAll(ctx); err != nil {
+ log.G(ctx).WithError(err).Errorf("Failed to kill container %q", p.id)
+ }
+ status = internalErrorCode
+ }
+ ExitCh <- Exit{
+ Timestamp: time.Now(),
+ ID: p.id,
+ Status: status,
+ }
+ }()
+ return nil
+}
+
+// SetExited set the exit stauts of the init process.
+func (p *Init) SetExited(status int) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ p.initState.SetExited(status)
+}
+
+func (p *Init) setExited(status int) {
+ p.exited = time.Now()
+ p.status = status
+ p.Platform.ShutdownConsole(context.Background(), p.console)
+ close(p.waitBlock)
+}
+
+// Delete deletes the init process.
+func (p *Init) Delete(ctx context.Context) error {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ return p.initState.Delete(ctx)
+}
+
+func (p *Init) delete(ctx context.Context) error {
+ p.killAll(ctx)
+ p.wg.Wait()
+ err := p.runtime.Delete(ctx, p.id, nil)
+ // ignore errors if a runtime has already deleted the process
+ // but we still hold metadata and pipes
+ //
+ // this is common during a checkpoint, runc will delete the container state
+ // after a checkpoint and the container will no longer exist within runc
+ if err != nil {
+ if strings.Contains(err.Error(), "does not exist") {
+ err = nil
+ } else {
+ err = p.runtimeError(err, "failed to delete task")
+ }
+ }
+ if p.io != nil {
+ for _, c := range p.closers {
+ c.Close()
+ }
+ p.io.Close()
+ }
+ if err2 := mount.UnmountAll(p.Rootfs, 0); err2 != nil {
+ log.G(ctx).WithError(err2).Warn("failed to cleanup rootfs mount")
+ if err == nil {
+ err = fmt.Errorf("failed rootfs umount: %w", err2)
+ }
+ }
+ return err
+}
+
+// Resize resizes the init processes console.
+func (p *Init) Resize(ws console.WinSize) error {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ if p.console == nil {
+ return nil
+ }
+ return p.console.Resize(ws)
+}
+
+func (p *Init) resize(ws console.WinSize) error {
+ if p.console == nil {
+ return nil
+ }
+ return p.console.Resize(ws)
+}
+
+// Kill kills the init process.
+func (p *Init) Kill(ctx context.Context, signal uint32, all bool) error {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ return p.initState.Kill(ctx, signal, all)
+}
+
+func (p *Init) kill(context context.Context, signal uint32, all bool) error {
+ var (
+ killErr error
+ backoff = 100 * time.Millisecond
+ )
+ timeout := 1 * time.Second
+ for start := time.Now(); time.Now().Sub(start) < timeout; {
+ c, err := p.runtime.State(context, p.id)
+ if err != nil {
+ if strings.Contains(err.Error(), "does not exist") {
+ return fmt.Errorf("no such process: %w", errdefs.ErrNotFound)
+ }
+ return p.runtimeError(err, "OCI runtime state failed")
+ }
+ // For runsc, signal only works when container is running state.
+ // If the container is not in running state, directly return
+ // "no such process"
+ if p.convertStatus(c.Status) == "stopped" {
+ return fmt.Errorf("no such process: %w", errdefs.ErrNotFound)
+ }
+ killErr = p.runtime.Kill(context, p.id, int(signal), &runsc.KillOpts{
+ All: all,
+ })
+ if killErr == nil {
+ return nil
+ }
+ time.Sleep(backoff)
+ backoff *= 2
+ }
+ return p.runtimeError(killErr, "kill timeout")
+}
+
+// KillAll kills all processes belonging to the init process.
+func (p *Init) KillAll(context context.Context) error {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ return p.killAll(context)
+}
+
+func (p *Init) killAll(context context.Context) error {
+ p.runtime.Kill(context, p.id, int(syscall.SIGKILL), &runsc.KillOpts{
+ All: true,
+ })
+ // Ignore error handling for `runsc kill --all` for now.
+ // * If it doesn't return error, it is good;
+ // * If it returns error, consider the container has already stopped.
+ // TODO: Fix `runsc kill --all` error handling.
+ return nil
+}
+
+// Stdin returns the stdin of the process.
+func (p *Init) Stdin() io.Closer {
+ return p.stdin
+}
+
+// Runtime returns the OCI runtime configured for the init process.
+func (p *Init) Runtime() *runsc.Runsc {
+ return p.runtime
+}
+
+// Exec returns a new child process.
+func (p *Init) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ return p.initState.Exec(ctx, path, r)
+}
+
+// exec returns a new exec'd process.
+func (p *Init) exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ // process exec request
+ var spec specs.Process
+ if err := json.Unmarshal(r.Spec.Value, &spec); err != nil {
+ return nil, err
+ }
+ spec.Terminal = r.Terminal
+
+ e := &execProcess{
+ id: r.ID,
+ path: path,
+ parent: p,
+ spec: spec,
+ stdio: stdio.Stdio{
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Terminal: r.Terminal,
+ },
+ waitBlock: make(chan struct{}),
+ }
+ e.execState = &execCreatedState{p: e}
+ return e, nil
+}
+
+// Stdio returns the stdio of the process.
+func (p *Init) Stdio() stdio.Stdio {
+ return p.stdio
+}
+
+func (p *Init) runtimeError(rErr error, msg string) error {
+ if rErr == nil {
+ return nil
+ }
+
+ rMsg, err := getLastRuntimeError(p.runtime)
+ switch {
+ case err != nil:
+ return fmt.Errorf("%s: %w (unable to retrieve OCI runtime error: %v)", msg, rErr, err)
+ case rMsg == "":
+ return fmt.Errorf("%s: %w", msg, rErr)
+ default:
+ return fmt.Errorf("%s: %s", msg, rMsg)
+ }
+}
+
+func (p *Init) convertStatus(status string) string {
+ if status == "created" && !p.Sandbox && p.status == internalErrorCode {
+ // Treat start failure state for non-root container as stopped.
+ return "stopped"
+ }
+ return status
+}
+
+func withConditionalIO(c stdio.Stdio) runc.IOOpt {
+ return func(o *runc.IOOption) {
+ o.OpenStdin = c.Stdin != ""
+ o.OpenStdout = c.Stdout != ""
+ o.OpenStderr = c.Stderr != ""
+ }
+}
diff --git a/pkg/shim/v1/proc/init_state.go b/pkg/shim/v1/proc/init_state.go
new file mode 100644
index 000000000..9233ecc85
--- /dev/null
+++ b/pkg/shim/v1/proc/init_state.go
@@ -0,0 +1,182 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/pkg/process"
+)
+
+type initState interface {
+ Resize(console.WinSize) error
+ Start(context.Context) error
+ Delete(context.Context) error
+ Exec(context.Context, string, *ExecConfig) (process.Process, error)
+ Kill(context.Context, uint32, bool) error
+ SetExited(int)
+}
+
+type createdState struct {
+ p *Init
+}
+
+func (s *createdState) transition(name string) error {
+ switch name {
+ case "running":
+ s.p.initState = &runningState{p: s.p}
+ case "stopped":
+ s.p.initState = &stoppedState{p: s.p}
+ case "deleted":
+ s.p.initState = &deletedState{}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *createdState) Resize(ws console.WinSize) error {
+ return s.p.resize(ws)
+}
+
+func (s *createdState) Start(ctx context.Context) error {
+ if err := s.p.start(ctx); err != nil {
+ // Containerd doesn't allow deleting container in created state.
+ // However, for gvisor, a non-root container in created state can
+ // only go to running state. If the container can't be started,
+ // it can only stay in created state, and never be deleted.
+ // To work around that, we treat non-root container in start failure
+ // state as stopped.
+ if !s.p.Sandbox {
+ s.p.io.Close()
+ s.p.setExited(internalErrorCode)
+ if err := s.transition("stopped"); err != nil {
+ panic(err)
+ }
+ }
+ return err
+ }
+ return s.transition("running")
+}
+
+func (s *createdState) Delete(ctx context.Context) error {
+ if err := s.p.delete(ctx); err != nil {
+ return err
+ }
+ return s.transition("deleted")
+}
+
+func (s *createdState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return s.p.kill(ctx, sig, all)
+}
+
+func (s *createdState) SetExited(status int) {
+ s.p.setExited(status)
+
+ if err := s.transition("stopped"); err != nil {
+ panic(err)
+ }
+}
+
+func (s *createdState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ return s.p.exec(ctx, path, r)
+}
+
+type runningState struct {
+ p *Init
+}
+
+func (s *runningState) transition(name string) error {
+ switch name {
+ case "stopped":
+ s.p.initState = &stoppedState{p: s.p}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *runningState) Resize(ws console.WinSize) error {
+ return s.p.resize(ws)
+}
+
+func (s *runningState) Start(ctx context.Context) error {
+ return fmt.Errorf("cannot start a running process.ss")
+}
+
+func (s *runningState) Delete(ctx context.Context) error {
+ return fmt.Errorf("cannot delete a running process.ss")
+}
+
+func (s *runningState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return s.p.kill(ctx, sig, all)
+}
+
+func (s *runningState) SetExited(status int) {
+ s.p.setExited(status)
+
+ if err := s.transition("stopped"); err != nil {
+ panic(err)
+ }
+}
+
+func (s *runningState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ return s.p.exec(ctx, path, r)
+}
+
+type stoppedState struct {
+ p *Init
+}
+
+func (s *stoppedState) transition(name string) error {
+ switch name {
+ case "deleted":
+ s.p.initState = &deletedState{}
+ default:
+ return fmt.Errorf("invalid state transition %q to %q", stateName(s), name)
+ }
+ return nil
+}
+
+func (s *stoppedState) Resize(ws console.WinSize) error {
+ return fmt.Errorf("cannot resize a stopped container")
+}
+
+func (s *stoppedState) Start(ctx context.Context) error {
+ return fmt.Errorf("cannot start a stopped process.ss")
+}
+
+func (s *stoppedState) Delete(ctx context.Context) error {
+ if err := s.p.delete(ctx); err != nil {
+ return err
+ }
+ return s.transition("deleted")
+}
+
+func (s *stoppedState) Kill(ctx context.Context, sig uint32, all bool) error {
+ return errdefs.ToGRPCf(errdefs.ErrNotFound, "process.ss %s not found", s.p.id)
+}
+
+func (s *stoppedState) SetExited(status int) {
+ // no op
+}
+
+func (s *stoppedState) Exec(ctx context.Context, path string, r *ExecConfig) (process.Process, error) {
+ return nil, fmt.Errorf("cannot exec in a stopped state")
+}
diff --git a/pkg/shim/v1/proc/io.go b/pkg/shim/v1/proc/io.go
new file mode 100644
index 000000000..34d825fb7
--- /dev/null
+++ b/pkg/shim/v1/proc/io.go
@@ -0,0 +1,162 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "os"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "github.com/containerd/containerd/log"
+ "github.com/containerd/fifo"
+ runc "github.com/containerd/go-runc"
+)
+
+// TODO(random-liu): This file can be a util.
+
+var bufPool = sync.Pool{
+ New: func() interface{} {
+ buffer := make([]byte, 32<<10)
+ return &buffer
+ },
+}
+
+func copyPipes(ctx context.Context, rio runc.IO, stdin, stdout, stderr string, wg *sync.WaitGroup) error {
+ var sameFile *countingWriteCloser
+ for _, i := range []struct {
+ name string
+ dest func(wc io.WriteCloser, rc io.Closer)
+ }{
+ {
+ name: stdout,
+ dest: func(wc io.WriteCloser, rc io.Closer) {
+ wg.Add(1)
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ if _, err := io.CopyBuffer(wc, rio.Stdout(), *p); err != nil {
+ log.G(ctx).Warn("error copying stdout")
+ }
+ wg.Done()
+ wc.Close()
+ if rc != nil {
+ rc.Close()
+ }
+ }()
+ },
+ }, {
+ name: stderr,
+ dest: func(wc io.WriteCloser, rc io.Closer) {
+ wg.Add(1)
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ if _, err := io.CopyBuffer(wc, rio.Stderr(), *p); err != nil {
+ log.G(ctx).Warn("error copying stderr")
+ }
+ wg.Done()
+ wc.Close()
+ if rc != nil {
+ rc.Close()
+ }
+ }()
+ },
+ },
+ } {
+ ok, err := isFifo(i.name)
+ if err != nil {
+ return err
+ }
+ var (
+ fw io.WriteCloser
+ fr io.Closer
+ )
+ if ok {
+ if fw, err = fifo.OpenFifo(ctx, i.name, syscall.O_WRONLY, 0); err != nil {
+ return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err)
+ }
+ if fr, err = fifo.OpenFifo(ctx, i.name, syscall.O_RDONLY, 0); err != nil {
+ return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err)
+ }
+ } else {
+ if sameFile != nil {
+ sameFile.count++
+ i.dest(sameFile, nil)
+ continue
+ }
+ if fw, err = os.OpenFile(i.name, syscall.O_WRONLY|syscall.O_APPEND, 0); err != nil {
+ return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", i.name, err)
+ }
+ if stdout == stderr {
+ sameFile = &countingWriteCloser{
+ WriteCloser: fw,
+ count: 1,
+ }
+ }
+ }
+ i.dest(fw, fr)
+ }
+ if stdin == "" {
+ return nil
+ }
+ f, err := fifo.OpenFifo(context.Background(), stdin, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
+ if err != nil {
+ return fmt.Errorf("gvisor-containerd-shim: opening %s failed: %s", stdin, err)
+ }
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+
+ io.CopyBuffer(rio.Stdin(), f, *p)
+ rio.Stdin().Close()
+ f.Close()
+ }()
+ return nil
+}
+
+// countingWriteCloser masks io.Closer() until close has been invoked a certain number of times.
+type countingWriteCloser struct {
+ io.WriteCloser
+ count int64
+}
+
+func (c *countingWriteCloser) Close() error {
+ if atomic.AddInt64(&c.count, -1) > 0 {
+ return nil
+ }
+ return c.WriteCloser.Close()
+}
+
+// isFifo checks if a file is a fifo.
+//
+// If the file does not exist then it returns false.
+func isFifo(path string) (bool, error) {
+ stat, err := os.Stat(path)
+ if err != nil {
+ if os.IsNotExist(err) {
+ return false, nil
+ }
+ return false, err
+ }
+ if stat.Mode()&os.ModeNamedPipe == os.ModeNamedPipe {
+ return true, nil
+ }
+ return false, nil
+}
diff --git a/pkg/shim/v1/proc/process.go b/pkg/shim/v1/proc/process.go
new file mode 100644
index 000000000..d462c3eef
--- /dev/null
+++ b/pkg/shim/v1/proc/process.go
@@ -0,0 +1,37 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "fmt"
+)
+
+// RunscRoot is the path to the root runsc state directory.
+const RunscRoot = "/run/containerd/runsc"
+
+func stateName(v interface{}) string {
+ switch v.(type) {
+ case *runningState, *execRunningState:
+ return "running"
+ case *createdState, *execCreatedState:
+ return "created"
+ case *deletedState:
+ return "deleted"
+ case *stoppedState:
+ return "stopped"
+ }
+ panic(fmt.Errorf("invalid state %v", v))
+}
diff --git a/pkg/shim/v1/proc/types.go b/pkg/shim/v1/proc/types.go
new file mode 100644
index 000000000..2b0df4663
--- /dev/null
+++ b/pkg/shim/v1/proc/types.go
@@ -0,0 +1,69 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "time"
+
+ runc "github.com/containerd/go-runc"
+ "github.com/gogo/protobuf/types"
+)
+
+// Mount holds filesystem mount configuration.
+type Mount struct {
+ Type string
+ Source string
+ Target string
+ Options []string
+}
+
+// CreateConfig hold task creation configuration.
+type CreateConfig struct {
+ ID string
+ Bundle string
+ Runtime string
+ Rootfs []Mount
+ Terminal bool
+ Stdin string
+ Stdout string
+ Stderr string
+ Options *types.Any
+}
+
+// ExecConfig holds exec creation configuration.
+type ExecConfig struct {
+ ID string
+ Terminal bool
+ Stdin string
+ Stdout string
+ Stderr string
+ Spec *types.Any
+}
+
+// Exit is the type of exit events.
+type Exit struct {
+ Timestamp time.Time
+ ID string
+ Status int
+}
+
+// ProcessMonitor monitors process exit changes.
+type ProcessMonitor interface {
+ // Subscribe to process exit changes
+ Subscribe() chan runc.Exit
+ // Unsubscribe to process exit changes
+ Unsubscribe(c chan runc.Exit)
+}
diff --git a/pkg/shim/v1/proc/utils.go b/pkg/shim/v1/proc/utils.go
new file mode 100644
index 000000000..716de2f59
--- /dev/null
+++ b/pkg/shim/v1/proc/utils.go
@@ -0,0 +1,90 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "encoding/json"
+ "io"
+ "os"
+ "strings"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/shim/runsc"
+)
+
+const (
+ internalErrorCode = 128
+ bufferSize = 32
+)
+
+// ExitCh is the exit events channel for containers and exec processes
+// inside the sandbox.
+var ExitCh = make(chan Exit, bufferSize)
+
+// TODO(mlaventure): move to runc package?
+func getLastRuntimeError(r *runsc.Runsc) (string, error) {
+ if r.Log == "" {
+ return "", nil
+ }
+
+ f, err := os.OpenFile(r.Log, os.O_RDONLY, 0400)
+ if err != nil {
+ return "", err
+ }
+
+ var (
+ errMsg string
+ log struct {
+ Level string
+ Msg string
+ Time time.Time
+ }
+ )
+
+ dec := json.NewDecoder(f)
+ for err = nil; err == nil; {
+ if err = dec.Decode(&log); err != nil && err != io.EOF {
+ return "", err
+ }
+ if log.Level == "error" {
+ errMsg = strings.TrimSpace(log.Msg)
+ }
+ }
+
+ return errMsg, nil
+}
+
+func copyFile(to, from string) error {
+ ff, err := os.Open(from)
+ if err != nil {
+ return err
+ }
+ defer ff.Close()
+ tt, err := os.Create(to)
+ if err != nil {
+ return err
+ }
+ defer tt.Close()
+
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ _, err = io.CopyBuffer(tt, ff, *p)
+ return err
+}
+
+func hasNoIO(r *CreateConfig) bool {
+ return r.Stdin == "" && r.Stdout == "" && r.Stderr == ""
+}
diff --git a/pkg/shim/v1/shim/BUILD b/pkg/shim/v1/shim/BUILD
new file mode 100644
index 000000000..05c595bc9
--- /dev/null
+++ b/pkg/shim/v1/shim/BUILD
@@ -0,0 +1,40 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "shim",
+ srcs = [
+ "api.go",
+ "platform.go",
+ "service.go",
+ ],
+ visibility = [
+ "//pkg/shim:__subpackages__",
+ "//shim:__subpackages__",
+ ],
+ deps = [
+ "//pkg/shim/runsc",
+ "//pkg/shim/v1/proc",
+ "//pkg/shim/v1/utils",
+ "@com_github_containerd_console//:go_default_library",
+ "@com_github_containerd_containerd//api/events:go_default_library",
+ "@com_github_containerd_containerd//api/types/task:go_default_library",
+ "@com_github_containerd_containerd//errdefs:go_default_library",
+ "@com_github_containerd_containerd//events:go_default_library",
+ "@com_github_containerd_containerd//log:go_default_library",
+ "@com_github_containerd_containerd//mount:go_default_library",
+ "@com_github_containerd_containerd//namespaces:go_default_library",
+ "@com_github_containerd_containerd//pkg/process:go_default_library",
+ "@com_github_containerd_containerd//pkg/stdio:go_default_library",
+ "@com_github_containerd_containerd//runtime:go_default_library",
+ "@com_github_containerd_containerd//runtime/linux/runctypes:go_default_library",
+ "@com_github_containerd_containerd//runtime/v1/shim/v1:go_default_library",
+ "@com_github_containerd_containerd//sys/reaper:go_default_library",
+ "@com_github_containerd_fifo//:go_default_library",
+ "@com_github_containerd_typeurl//:go_default_library",
+ "@com_github_gogo_protobuf//types:go_default_library",
+ "@org_golang_google_grpc//codes:go_default_library",
+ "@org_golang_google_grpc//status:go_default_library",
+ ],
+)
diff --git a/pkg/shim/v1/shim/api.go b/pkg/shim/v1/shim/api.go
new file mode 100644
index 000000000..5dd8ff172
--- /dev/null
+++ b/pkg/shim/v1/shim/api.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package shim
+
+import (
+ "github.com/containerd/containerd/api/events"
+)
+
+type TaskCreate = events.TaskCreate
+type TaskStart = events.TaskStart
+type TaskOOM = events.TaskOOM
+type TaskExit = events.TaskExit
+type TaskDelete = events.TaskDelete
+type TaskExecAdded = events.TaskExecAdded
+type TaskExecStarted = events.TaskExecStarted
diff --git a/pkg/shim/v1/shim/platform.go b/pkg/shim/v1/shim/platform.go
new file mode 100644
index 000000000..f590f80ef
--- /dev/null
+++ b/pkg/shim/v1/shim/platform.go
@@ -0,0 +1,106 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package shim
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "sync"
+ "syscall"
+
+ "github.com/containerd/console"
+ "github.com/containerd/fifo"
+)
+
+type linuxPlatform struct {
+ epoller *console.Epoller
+}
+
+func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg *sync.WaitGroup) (console.Console, error) {
+ if p.epoller == nil {
+ return nil, fmt.Errorf("uninitialized epoller")
+ }
+
+ epollConsole, err := p.epoller.Add(console)
+ if err != nil {
+ return nil, err
+ }
+
+ if stdin != "" {
+ in, err := fifo.OpenFifo(ctx, stdin, syscall.O_RDONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ io.CopyBuffer(epollConsole, in, *p)
+ }()
+ }
+
+ outw, err := fifo.OpenFifo(ctx, stdout, syscall.O_WRONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ outr, err := fifo.OpenFifo(ctx, stdout, syscall.O_RDONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ wg.Add(1)
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ io.CopyBuffer(outw, epollConsole, *p)
+ epollConsole.Close()
+ outr.Close()
+ outw.Close()
+ wg.Done()
+ }()
+ return epollConsole, nil
+}
+
+func (p *linuxPlatform) ShutdownConsole(ctx context.Context, cons console.Console) error {
+ if p.epoller == nil {
+ return fmt.Errorf("uninitialized epoller")
+ }
+ epollConsole, ok := cons.(*console.EpollConsole)
+ if !ok {
+ return fmt.Errorf("expected EpollConsole, got %#v", cons)
+ }
+ return epollConsole.Shutdown(p.epoller.CloseConsole)
+}
+
+func (p *linuxPlatform) Close() error {
+ return p.epoller.Close()
+}
+
+// initialize a single epoll fd to manage our consoles. `initPlatform` should
+// only be called once.
+func (s *Service) initPlatform() error {
+ if s.platform != nil {
+ return nil
+ }
+ epoller, err := console.NewEpoller()
+ if err != nil {
+ return fmt.Errorf("failed to initialize epoller: %w", err)
+ }
+ s.platform = &linuxPlatform{
+ epoller: epoller,
+ }
+ go epoller.Wait()
+ return nil
+}
diff --git a/pkg/shim/v1/shim/service.go b/pkg/shim/v1/shim/service.go
new file mode 100644
index 000000000..84a810cb2
--- /dev/null
+++ b/pkg/shim/v1/shim/service.go
@@ -0,0 +1,573 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package shim
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "path/filepath"
+ "sync"
+
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/api/types/task"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/events"
+ "github.com/containerd/containerd/log"
+ "github.com/containerd/containerd/mount"
+ "github.com/containerd/containerd/namespaces"
+ "github.com/containerd/containerd/pkg/process"
+ "github.com/containerd/containerd/pkg/stdio"
+ "github.com/containerd/containerd/runtime"
+ "github.com/containerd/containerd/runtime/linux/runctypes"
+ shim "github.com/containerd/containerd/runtime/v1/shim/v1"
+ "github.com/containerd/containerd/sys/reaper"
+ "github.com/containerd/typeurl"
+ "github.com/gogo/protobuf/types"
+ "google.golang.org/grpc/codes"
+ "google.golang.org/grpc/status"
+
+ "gvisor.dev/gvisor/pkg/shim/runsc"
+ "gvisor.dev/gvisor/pkg/shim/v1/proc"
+ "gvisor.dev/gvisor/pkg/shim/v1/utils"
+)
+
+var (
+ empty = &types.Empty{}
+ bufPool = sync.Pool{
+ New: func() interface{} {
+ buffer := make([]byte, 32<<10)
+ return &buffer
+ },
+ }
+)
+
+// Config contains shim specific configuration.
+type Config struct {
+ Path string
+ Namespace string
+ WorkDir string
+ RuntimeRoot string
+ RunscConfig map[string]string
+}
+
+// NewService returns a new shim service that can be used via GRPC.
+func NewService(config Config, publisher events.Publisher) (*Service, error) {
+ if config.Namespace == "" {
+ return nil, fmt.Errorf("shim namespace cannot be empty")
+ }
+ ctx := namespaces.WithNamespace(context.Background(), config.Namespace)
+ s := &Service{
+ config: config,
+ context: ctx,
+ processes: make(map[string]process.Process),
+ events: make(chan interface{}, 128),
+ ec: proc.ExitCh,
+ }
+ go s.processExits()
+ if err := s.initPlatform(); err != nil {
+ return nil, fmt.Errorf("failed to initialized platform behavior: %w", err)
+ }
+ go s.forward(publisher)
+ return s, nil
+}
+
+// Service is the shim implementation of a remote shim over GRPC.
+type Service struct {
+ mu sync.Mutex
+
+ config Config
+ context context.Context
+ processes map[string]process.Process
+ events chan interface{}
+ platform stdio.Platform
+ ec chan proc.Exit
+
+ // Filled by Create()
+ id string
+ bundle string
+}
+
+// Create creates a new initial process and container with the underlying OCI runtime.
+func (s *Service) Create(ctx context.Context, r *shim.CreateTaskRequest) (_ *shim.CreateTaskResponse, err error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ var mounts []proc.Mount
+ for _, m := range r.Rootfs {
+ mounts = append(mounts, proc.Mount{
+ Type: m.Type,
+ Source: m.Source,
+ Target: m.Target,
+ Options: m.Options,
+ })
+ }
+
+ rootfs := filepath.Join(r.Bundle, "rootfs")
+ if err := os.Mkdir(rootfs, 0711); err != nil && !os.IsExist(err) {
+ return nil, err
+ }
+
+ config := &proc.CreateConfig{
+ ID: r.ID,
+ Bundle: r.Bundle,
+ Runtime: r.Runtime,
+ Rootfs: mounts,
+ Terminal: r.Terminal,
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Options: r.Options,
+ }
+ defer func() {
+ if err != nil {
+ if err2 := mount.UnmountAll(rootfs, 0); err2 != nil {
+ log.G(ctx).WithError(err2).Warn("Failed to cleanup rootfs mount")
+ }
+ }
+ }()
+ for _, rm := range mounts {
+ m := &mount.Mount{
+ Type: rm.Type,
+ Source: rm.Source,
+ Options: rm.Options,
+ }
+ if err := m.Mount(rootfs); err != nil {
+ return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err)
+ }
+ }
+ process, err := newInit(
+ ctx,
+ s.config.Path,
+ s.config.WorkDir,
+ s.config.RuntimeRoot,
+ s.config.Namespace,
+ s.config.RunscConfig,
+ s.platform,
+ config,
+ )
+ if err := process.Create(ctx, config); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ // Save the main task id and bundle to the shim for additional
+ // requests.
+ s.id = r.ID
+ s.bundle = r.Bundle
+ pid := process.Pid()
+ s.processes[r.ID] = process
+ return &shim.CreateTaskResponse{
+ Pid: uint32(pid),
+ }, nil
+}
+
+// Start starts a process.
+func (s *Service) Start(ctx context.Context, r *shim.StartRequest) (*shim.StartResponse, error) {
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Start(ctx); err != nil {
+ return nil, err
+ }
+ return &shim.StartResponse{
+ ID: p.ID(),
+ Pid: uint32(p.Pid()),
+ }, nil
+}
+
+// Delete deletes the initial process and container.
+func (s *Service) Delete(ctx context.Context, r *types.Empty) (*shim.DeleteResponse, error) {
+ p, err := s.getInitProcess()
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Delete(ctx); err != nil {
+ return nil, err
+ }
+ s.mu.Lock()
+ delete(s.processes, s.id)
+ s.mu.Unlock()
+ s.platform.Close()
+ return &shim.DeleteResponse{
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ Pid: uint32(p.Pid()),
+ }, nil
+}
+
+// DeleteProcess deletes an exec'd process.
+func (s *Service) DeleteProcess(ctx context.Context, r *shim.DeleteProcessRequest) (*shim.DeleteResponse, error) {
+ if r.ID == s.id {
+ return nil, status.Errorf(codes.InvalidArgument, "cannot delete init process with DeleteProcess")
+ }
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Delete(ctx); err != nil {
+ return nil, err
+ }
+ s.mu.Lock()
+ delete(s.processes, r.ID)
+ s.mu.Unlock()
+ return &shim.DeleteResponse{
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ Pid: uint32(p.Pid()),
+ }, nil
+}
+
+// Exec spawns an additional process inside the container.
+func (s *Service) Exec(ctx context.Context, r *shim.ExecProcessRequest) (*types.Empty, error) {
+ s.mu.Lock()
+
+ if p := s.processes[r.ID]; p != nil {
+ s.mu.Unlock()
+ return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ID)
+ }
+
+ p := s.processes[s.id]
+ s.mu.Unlock()
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+
+ process, err := p.(*proc.Init).Exec(ctx, s.config.Path, &proc.ExecConfig{
+ ID: r.ID,
+ Terminal: r.Terminal,
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Spec: r.Spec,
+ })
+ if err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ s.mu.Lock()
+ s.processes[r.ID] = process
+ s.mu.Unlock()
+ return empty, nil
+}
+
+// ResizePty resises the terminal of a process.
+func (s *Service) ResizePty(ctx context.Context, r *shim.ResizePtyRequest) (*types.Empty, error) {
+ if r.ID == "" {
+ return nil, errdefs.ToGRPCf(errdefs.ErrInvalidArgument, "id not provided")
+ }
+ ws := console.WinSize{
+ Width: uint16(r.Width),
+ Height: uint16(r.Height),
+ }
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Resize(ws); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ return empty, nil
+}
+
+// State returns runtime state information for a process.
+func (s *Service) State(ctx context.Context, r *shim.StateRequest) (*shim.StateResponse, error) {
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ st, err := p.Status(ctx)
+ if err != nil {
+ return nil, err
+ }
+ status := task.StatusUnknown
+ switch st {
+ case "created":
+ status = task.StatusCreated
+ case "running":
+ status = task.StatusRunning
+ case "stopped":
+ status = task.StatusStopped
+ }
+ sio := p.Stdio()
+ return &shim.StateResponse{
+ ID: p.ID(),
+ Bundle: s.bundle,
+ Pid: uint32(p.Pid()),
+ Status: status,
+ Stdin: sio.Stdin,
+ Stdout: sio.Stdout,
+ Stderr: sio.Stderr,
+ Terminal: sio.Terminal,
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ }, nil
+}
+
+// Pause pauses the container.
+func (s *Service) Pause(ctx context.Context, r *types.Empty) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Resume resumes the container.
+func (s *Service) Resume(ctx context.Context, r *types.Empty) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Kill kills a process with the provided signal.
+func (s *Service) Kill(ctx context.Context, r *shim.KillRequest) (*types.Empty, error) {
+ if r.ID == "" {
+ p, err := s.getInitProcess()
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Kill(ctx, r.Signal, r.All); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ return empty, nil
+ }
+
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Kill(ctx, r.Signal, r.All); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ return empty, nil
+}
+
+// ListPids returns all pids inside the container.
+func (s *Service) ListPids(ctx context.Context, r *shim.ListPidsRequest) (*shim.ListPidsResponse, error) {
+ pids, err := s.getContainerPids(ctx, r.ID)
+ if err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ var processes []*task.ProcessInfo
+ for _, pid := range pids {
+ pInfo := task.ProcessInfo{
+ Pid: pid,
+ }
+ for _, p := range s.processes {
+ if p.Pid() == int(pid) {
+ d := &runctypes.ProcessDetails{
+ ExecID: p.ID(),
+ }
+ a, err := typeurl.MarshalAny(d)
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err)
+ }
+ pInfo.Info = a
+ break
+ }
+ }
+ processes = append(processes, &pInfo)
+ }
+ return &shim.ListPidsResponse{
+ Processes: processes,
+ }, nil
+}
+
+// CloseIO closes the I/O context of a process.
+func (s *Service) CloseIO(ctx context.Context, r *shim.CloseIORequest) (*types.Empty, error) {
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ if stdin := p.Stdin(); stdin != nil {
+ if err := stdin.Close(); err != nil {
+ return nil, fmt.Errorf("close stdin: %w", err)
+ }
+ }
+ return empty, nil
+}
+
+// Checkpoint checkpoints the container.
+func (s *Service) Checkpoint(ctx context.Context, r *shim.CheckpointTaskRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// ShimInfo returns shim information such as the shim's pid.
+func (s *Service) ShimInfo(ctx context.Context, r *types.Empty) (*shim.ShimInfoResponse, error) {
+ return &shim.ShimInfoResponse{
+ ShimPid: uint32(os.Getpid()),
+ }, nil
+}
+
+// Update updates a running container.
+func (s *Service) Update(ctx context.Context, r *shim.UpdateTaskRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Wait waits for a process to exit.
+func (s *Service) Wait(ctx context.Context, r *shim.WaitRequest) (*shim.WaitResponse, error) {
+ p, err := s.getExecProcess(r.ID)
+ if err != nil {
+ return nil, err
+ }
+ p.Wait()
+
+ return &shim.WaitResponse{
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ }, nil
+}
+
+func (s *Service) processExits() {
+ for e := range s.ec {
+ s.checkProcesses(e)
+ }
+}
+
+func (s *Service) allProcesses() []process.Process {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ res := make([]process.Process, 0, len(s.processes))
+ for _, p := range s.processes {
+ res = append(res, p)
+ }
+ return res
+}
+
+func (s *Service) checkProcesses(e proc.Exit) {
+ for _, p := range s.allProcesses() {
+ if p.ID() == e.ID {
+ if ip, ok := p.(*proc.Init); ok {
+ // Ensure all children are killed.
+ if err := ip.KillAll(s.context); err != nil {
+ log.G(s.context).WithError(err).WithField("id", ip.ID()).
+ Error("failed to kill init's children")
+ }
+ }
+ p.SetExited(e.Status)
+ s.events <- &TaskExit{
+ ContainerID: s.id,
+ ID: p.ID(),
+ Pid: uint32(p.Pid()),
+ ExitStatus: uint32(e.Status),
+ ExitedAt: p.ExitedAt(),
+ }
+ return
+ }
+ }
+}
+
+func (s *Service) getContainerPids(ctx context.Context, id string) ([]uint32, error) {
+ p, err := s.getInitProcess()
+ if err != nil {
+ return nil, err
+ }
+
+ ps, err := p.(*proc.Init).Runtime().Ps(ctx, id)
+ if err != nil {
+ return nil, err
+ }
+ pids := make([]uint32, 0, len(ps))
+ for _, pid := range ps {
+ pids = append(pids, uint32(pid))
+ }
+ return pids, nil
+}
+
+func (s *Service) forward(publisher events.Publisher) {
+ for e := range s.events {
+ if err := publisher.Publish(s.context, getTopic(s.context, e), e); err != nil {
+ log.G(s.context).WithError(err).Error("post event")
+ }
+ }
+}
+
+// getInitProcess returns the init process.
+func (s *Service) getInitProcess() (process.Process, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ p := s.processes[s.id]
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+ return p, nil
+}
+
+// getExecProcess returns the given exec process.
+func (s *Service) getExecProcess(id string) (process.Process, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ p := s.processes[id]
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process %s does not exist", id)
+ }
+ return p, nil
+}
+
+func getTopic(ctx context.Context, e interface{}) string {
+ switch e.(type) {
+ case *TaskCreate:
+ return runtime.TaskCreateEventTopic
+ case *TaskStart:
+ return runtime.TaskStartEventTopic
+ case *TaskOOM:
+ return runtime.TaskOOMEventTopic
+ case *TaskExit:
+ return runtime.TaskExitEventTopic
+ case *TaskDelete:
+ return runtime.TaskDeleteEventTopic
+ case *TaskExecAdded:
+ return runtime.TaskExecAddedEventTopic
+ case *TaskExecStarted:
+ return runtime.TaskExecStartedEventTopic
+ default:
+ log.L.Printf("no topic for type %#v", e)
+ }
+ return runtime.TaskUnknownTopic
+}
+
+func newInit(ctx context.Context, path, workDir, runtimeRoot, namespace string, config map[string]string, platform stdio.Platform, r *proc.CreateConfig) (*proc.Init, error) {
+ var options runctypes.CreateOptions
+ if r.Options != nil {
+ v, err := typeurl.UnmarshalAny(r.Options)
+ if err != nil {
+ return nil, err
+ }
+ options = *v.(*runctypes.CreateOptions)
+ }
+
+ spec, err := utils.ReadSpec(r.Bundle)
+ if err != nil {
+ return nil, fmt.Errorf("read oci spec: %w", err)
+ }
+ if err := utils.UpdateVolumeAnnotations(r.Bundle, spec); err != nil {
+ return nil, fmt.Errorf("update volume annotations: %w", err)
+ }
+
+ runsc.FormatLogPath(r.ID, config)
+ rootfs := filepath.Join(path, "rootfs")
+ runtime := proc.NewRunsc(runtimeRoot, path, namespace, r.Runtime, config)
+ p := proc.New(r.ID, runtime, stdio.Stdio{
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Terminal: r.Terminal,
+ })
+ p.Bundle = r.Bundle
+ p.Platform = platform
+ p.Rootfs = rootfs
+ p.WorkDir = workDir
+ p.IoUID = int(options.IoUid)
+ p.IoGID = int(options.IoGid)
+ p.Sandbox = utils.IsSandbox(spec)
+ p.UserLog = utils.UserLogPath(spec)
+ p.Monitor = reaper.Default
+ return p, nil
+}
diff --git a/pkg/shim/v1/utils/BUILD b/pkg/shim/v1/utils/BUILD
new file mode 100644
index 000000000..54a0aabb7
--- /dev/null
+++ b/pkg/shim/v1/utils/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "utils",
+ srcs = [
+ "annotations.go",
+ "utils.go",
+ "volumes.go",
+ ],
+ visibility = [
+ "//pkg/shim:__subpackages__",
+ "//shim:__subpackages__",
+ ],
+ deps = [
+ "@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
+ ],
+)
+
+go_test(
+ name = "utils_test",
+ size = "small",
+ srcs = ["volumes_test.go"],
+ library = ":utils",
+ deps = ["@com_github_opencontainers_runtime_spec//specs-go:go_default_library"],
+)
diff --git a/pkg/shim/v1/utils/annotations.go b/pkg/shim/v1/utils/annotations.go
new file mode 100644
index 000000000..1e9d3f365
--- /dev/null
+++ b/pkg/shim/v1/utils/annotations.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+// Annotations from the CRI annotations package.
+//
+// These are vendor due to import conflicts.
+const (
+ sandboxLogDirAnnotation = "io.kubernetes.cri.sandbox-log-directory"
+ containerTypeAnnotation = "io.kubernetes.cri.container-type"
+ containerTypeSandbox = "sandbox"
+ containerTypeContainer = "container"
+)
diff --git a/pkg/shim/v1/utils/utils.go b/pkg/shim/v1/utils/utils.go
new file mode 100644
index 000000000..07e346654
--- /dev/null
+++ b/pkg/shim/v1/utils/utils.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "encoding/json"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// ReadSpec reads OCI spec from the bundle directory.
+func ReadSpec(bundle string) (*specs.Spec, error) {
+ f, err := os.Open(filepath.Join(bundle, "config.json"))
+ if err != nil {
+ return nil, err
+ }
+ b, err := ioutil.ReadAll(f)
+ if err != nil {
+ return nil, err
+ }
+ var spec specs.Spec
+ if err := json.Unmarshal(b, &spec); err != nil {
+ return nil, err
+ }
+ return &spec, nil
+}
+
+// IsSandbox checks whether a container is a sandbox container.
+func IsSandbox(spec *specs.Spec) bool {
+ t, ok := spec.Annotations[containerTypeAnnotation]
+ return !ok || t == containerTypeSandbox
+}
+
+// UserLogPath gets user log path from OCI annotation.
+func UserLogPath(spec *specs.Spec) string {
+ sandboxLogDir := spec.Annotations[sandboxLogDirAnnotation]
+ if sandboxLogDir == "" {
+ return ""
+ }
+ return filepath.Join(sandboxLogDir, "gvisor.log")
+}
diff --git a/pkg/shim/v1/utils/volumes.go b/pkg/shim/v1/utils/volumes.go
new file mode 100644
index 000000000..52a428179
--- /dev/null
+++ b/pkg/shim/v1/utils/volumes.go
@@ -0,0 +1,155 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "path/filepath"
+ "strings"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const volumeKeyPrefix = "dev.gvisor.spec.mount."
+
+var kubeletPodsDir = "/var/lib/kubelet/pods"
+
+// volumeName gets volume name from volume annotation key, example:
+// dev.gvisor.spec.mount.NAME.share
+func volumeName(k string) string {
+ return strings.SplitN(strings.TrimPrefix(k, volumeKeyPrefix), ".", 2)[0]
+}
+
+// volumeFieldName gets volume field name from volume annotation key, example:
+// `type` is the field of dev.gvisor.spec.mount.NAME.type
+func volumeFieldName(k string) string {
+ parts := strings.Split(strings.TrimPrefix(k, volumeKeyPrefix), ".")
+ return parts[len(parts)-1]
+}
+
+// podUID gets pod UID from the pod log path.
+func podUID(s *specs.Spec) (string, error) {
+ sandboxLogDir := s.Annotations[sandboxLogDirAnnotation]
+ if sandboxLogDir == "" {
+ return "", fmt.Errorf("no sandbox log path annotation")
+ }
+ fields := strings.Split(filepath.Base(sandboxLogDir), "_")
+ switch len(fields) {
+ case 1: // This is the old CRI logging path.
+ return fields[0], nil
+ case 3: // This is the new CRI logging path.
+ return fields[2], nil
+ }
+ return "", fmt.Errorf("unexpected sandbox log path %q", sandboxLogDir)
+}
+
+// isVolumeKey checks whether an annotation key is for volume.
+func isVolumeKey(k string) bool {
+ return strings.HasPrefix(k, volumeKeyPrefix)
+}
+
+// volumeSourceKey constructs the annotation key for volume source.
+func volumeSourceKey(volume string) string {
+ return volumeKeyPrefix + volume + ".source"
+}
+
+// volumePath searches the volume path in the kubelet pod directory.
+func volumePath(volume, uid string) (string, error) {
+ // TODO: Support subpath when gvisor supports pod volume bind mount.
+ volumeSearchPath := fmt.Sprintf("%s/%s/volumes/*/%s", kubeletPodsDir, uid, volume)
+ dirs, err := filepath.Glob(volumeSearchPath)
+ if err != nil {
+ return "", err
+ }
+ if len(dirs) != 1 {
+ return "", fmt.Errorf("unexpected matched volume list %v", dirs)
+ }
+ return dirs[0], nil
+}
+
+// isVolumePath checks whether a string is the volume path.
+func isVolumePath(volume, path string) (bool, error) {
+ // TODO: Support subpath when gvisor supports pod volume bind mount.
+ volumeSearchPath := fmt.Sprintf("%s/*/volumes/*/%s", kubeletPodsDir, volume)
+ return filepath.Match(volumeSearchPath, path)
+}
+
+// UpdateVolumeAnnotations add necessary OCI annotations for gvisor
+// volume optimization.
+func UpdateVolumeAnnotations(bundle string, s *specs.Spec) error {
+ var (
+ uid string
+ err error
+ )
+ if IsSandbox(s) {
+ uid, err = podUID(s)
+ if err != nil {
+ // Skip if we can't get pod UID, because this doesn't work
+ // for containerd 1.1.
+ return nil
+ }
+ }
+ var updated bool
+ for k, v := range s.Annotations {
+ if !isVolumeKey(k) {
+ continue
+ }
+ if volumeFieldName(k) != "type" {
+ continue
+ }
+ volume := volumeName(k)
+ if uid != "" {
+ // This is a sandbox.
+ path, err := volumePath(volume, uid)
+ if err != nil {
+ return fmt.Errorf("get volume path for %q: %w", volume, err)
+ }
+ s.Annotations[volumeSourceKey(volume)] = path
+ updated = true
+ } else {
+ // This is a container.
+ for i := range s.Mounts {
+ // An error is returned for sandbox if source
+ // annotation is not successfully applied, so
+ // it is guaranteed that the source annotation
+ // for sandbox has already been successfully
+ // applied at this point.
+ //
+ // The volume name is unique inside a pod, so
+ // matching without podUID is fine here.
+ //
+ // TODO: Pass podUID down to shim for containers to do
+ // more accurate matching.
+ if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes {
+ // gVisor requires the container mount type to match
+ // sandbox mount type.
+ s.Mounts[i].Type = v
+ updated = true
+ }
+ }
+ }
+ }
+ if !updated {
+ return nil
+ }
+ // Update bundle.
+ b, err := json.Marshal(s)
+ if err != nil {
+ return err
+ }
+ return ioutil.WriteFile(filepath.Join(bundle, "config.json"), b, 0666)
+}
diff --git a/pkg/shim/v1/utils/volumes_test.go b/pkg/shim/v1/utils/volumes_test.go
new file mode 100644
index 000000000..3e02c6151
--- /dev/null
+++ b/pkg/shim/v1/utils/volumes_test.go
@@ -0,0 +1,308 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package utils
+
+import (
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "reflect"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestUpdateVolumeAnnotations(t *testing.T) {
+ dir, err := ioutil.TempDir("", "test-update-volume-annotations")
+ if err != nil {
+ t.Fatalf("create tempdir: %v", err)
+ }
+ defer os.RemoveAll(dir)
+ kubeletPodsDir = dir
+
+ const (
+ testPodUID = "testuid"
+ testVolumeName = "testvolume"
+ testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID
+ testLegacyLogDirPath = "/var/log/pods/" + testPodUID
+ )
+ testVolumePath := fmt.Sprintf("%s/%s/volumes/kubernetes.io~empty-dir/%s", dir, testPodUID, testVolumeName)
+
+ if err := os.MkdirAll(testVolumePath, 0755); err != nil {
+ t.Fatalf("Create test volume: %v", err)
+ }
+
+ for _, test := range []struct {
+ desc string
+ spec *specs.Spec
+ expected *specs.Spec
+ expectErr bool
+ expectUpdate bool
+ }{
+ {
+ desc: "volume annotations for sandbox",
+ spec: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expected: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ "dev.gvisor.spec.mount." + testVolumeName + ".source": testVolumePath,
+ },
+ },
+ expectUpdate: true,
+ },
+ {
+ desc: "volume annotations for sandbox with legacy log path",
+ spec: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLegacyLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expected: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLegacyLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ "dev.gvisor.spec.mount." + testVolumeName + ".source": testVolumePath,
+ },
+ },
+ expectUpdate: true,
+ },
+ {
+ desc: "tmpfs: volume annotations for container",
+ spec: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "bind",
+ Source: testVolumePath,
+ Options: []string{"ro"},
+ },
+ {
+ Destination: "/random",
+ Type: "bind",
+ Source: "/random",
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expected: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "tmpfs",
+ Source: testVolumePath,
+ Options: []string{"ro"},
+ },
+ {
+ Destination: "/random",
+ Type: "bind",
+ Source: "/random",
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expectUpdate: true,
+ },
+ {
+ desc: "bind: volume annotations for container",
+ spec: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "bind",
+ Source: testVolumePath,
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "container",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "bind",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expected: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "bind",
+ Source: testVolumePath,
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "container",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "bind",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expectUpdate: true,
+ },
+ {
+ desc: "should not return error without pod log directory",
+ spec: &specs.Spec{
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ expected: &specs.Spec{
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount." + testVolumeName + ".share": "pod",
+ "dev.gvisor.spec.mount." + testVolumeName + ".type": "tmpfs",
+ "dev.gvisor.spec.mount." + testVolumeName + ".options": "ro",
+ },
+ },
+ },
+ {
+ desc: "should return error if volume path does not exist",
+ spec: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ "dev.gvisor.spec.mount.notexist.share": "pod",
+ "dev.gvisor.spec.mount.notexist.type": "tmpfs",
+ "dev.gvisor.spec.mount.notexist.options": "ro",
+ },
+ },
+ expectErr: true,
+ },
+ {
+ desc: "no volume annotations for sandbox",
+ spec: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ },
+ },
+ expected: &specs.Spec{
+ Annotations: map[string]string{
+ sandboxLogDirAnnotation: testLogDirPath,
+ containerTypeAnnotation: containerTypeSandbox,
+ },
+ },
+ },
+ {
+ desc: "no volume annotations for container",
+ spec: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "bind",
+ Source: "/test",
+ Options: []string{"ro"},
+ },
+ {
+ Destination: "/random",
+ Type: "bind",
+ Source: "/random",
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ },
+ },
+ expected: &specs.Spec{
+ Mounts: []specs.Mount{
+ {
+ Destination: "/test",
+ Type: "bind",
+ Source: "/test",
+ Options: []string{"ro"},
+ },
+ {
+ Destination: "/random",
+ Type: "bind",
+ Source: "/random",
+ Options: []string{"ro"},
+ },
+ },
+ Annotations: map[string]string{
+ containerTypeAnnotation: containerTypeContainer,
+ },
+ },
+ },
+ } {
+ t.Run(test.desc, func(t *testing.T) {
+ bundle, err := ioutil.TempDir(dir, "test-bundle")
+ if err != nil {
+ t.Fatalf("Create test bundle: %v", err)
+ }
+ err = UpdateVolumeAnnotations(bundle, test.spec)
+ if test.expectErr {
+ if err == nil {
+ t.Fatal("Expected error, but got nil")
+ }
+ return
+ }
+ if err != nil {
+ t.Fatalf("Unexpected error: %v", err)
+ }
+ if !reflect.DeepEqual(test.expected, test.spec) {
+ t.Fatalf("Expected %+v, got %+v", test.expected, test.spec)
+ }
+ if test.expectUpdate {
+ b, err := ioutil.ReadFile(filepath.Join(bundle, "config.json"))
+ if err != nil {
+ t.Fatalf("Read spec from bundle: %v", err)
+ }
+ var spec specs.Spec
+ if err := json.Unmarshal(b, &spec); err != nil {
+ t.Fatalf("Unmarshal spec: %v", err)
+ }
+ if !reflect.DeepEqual(test.expected, &spec) {
+ t.Fatalf("Expected %+v, got %+v", test.expected, &spec)
+ }
+ }
+ })
+ }
+}
diff --git a/pkg/shim/v2/BUILD b/pkg/shim/v2/BUILD
new file mode 100644
index 000000000..7e0a114a0
--- /dev/null
+++ b/pkg/shim/v2/BUILD
@@ -0,0 +1,43 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "v2",
+ srcs = [
+ "api.go",
+ "epoll.go",
+ "service.go",
+ "service_linux.go",
+ ],
+ visibility = ["//shim:__subpackages__"],
+ deps = [
+ "//pkg/shim/runsc",
+ "//pkg/shim/v1/proc",
+ "//pkg/shim/v1/utils",
+ "//pkg/shim/v2/options",
+ "//pkg/shim/v2/runtimeoptions",
+ "//runsc/specutils",
+ "@com_github_burntsushi_toml//:go_default_library",
+ "@com_github_containerd_cgroups//:go_default_library",
+ "@com_github_containerd_console//:go_default_library",
+ "@com_github_containerd_containerd//api/events:go_default_library",
+ "@com_github_containerd_containerd//api/types/task:go_default_library",
+ "@com_github_containerd_containerd//errdefs:go_default_library",
+ "@com_github_containerd_containerd//events:go_default_library",
+ "@com_github_containerd_containerd//log:go_default_library",
+ "@com_github_containerd_containerd//mount:go_default_library",
+ "@com_github_containerd_containerd//namespaces:go_default_library",
+ "@com_github_containerd_containerd//pkg/process:go_default_library",
+ "@com_github_containerd_containerd//pkg/stdio:go_default_library",
+ "@com_github_containerd_containerd//runtime:go_default_library",
+ "@com_github_containerd_containerd//runtime/linux/runctypes:go_default_library",
+ "@com_github_containerd_containerd//runtime/v2/shim:go_default_library",
+ "@com_github_containerd_containerd//runtime/v2/task:go_default_library",
+ "@com_github_containerd_containerd//sys/reaper:go_default_library",
+ "@com_github_containerd_fifo//:go_default_library",
+ "@com_github_containerd_typeurl//:go_default_library",
+ "@com_github_gogo_protobuf//types:go_default_library",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
diff --git a/pkg/shim/v2/api.go b/pkg/shim/v2/api.go
new file mode 100644
index 000000000..dbe5c59f6
--- /dev/null
+++ b/pkg/shim/v2/api.go
@@ -0,0 +1,22 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package v2
+
+import (
+ "github.com/containerd/containerd/api/events"
+)
+
+type TaskOOM = events.TaskOOM
diff --git a/pkg/shim/v2/epoll.go b/pkg/shim/v2/epoll.go
new file mode 100644
index 000000000..41232cca8
--- /dev/null
+++ b/pkg/shim/v2/epoll.go
@@ -0,0 +1,129 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package v2
+
+import (
+ "context"
+ "fmt"
+ "sync"
+
+ "github.com/containerd/cgroups"
+ "github.com/containerd/containerd/events"
+ "github.com/containerd/containerd/runtime"
+ "golang.org/x/sys/unix"
+)
+
+func newOOMEpoller(publisher events.Publisher) (*epoller, error) {
+ fd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
+ if err != nil {
+ return nil, err
+ }
+ return &epoller{
+ fd: fd,
+ publisher: publisher,
+ set: make(map[uintptr]*item),
+ }, nil
+}
+
+type epoller struct {
+ mu sync.Mutex
+
+ fd int
+ publisher events.Publisher
+ set map[uintptr]*item
+}
+
+type item struct {
+ id string
+ cg cgroups.Cgroup
+}
+
+func (e *epoller) Close() error {
+ return unix.Close(e.fd)
+}
+
+func (e *epoller) run(ctx context.Context) {
+ var events [128]unix.EpollEvent
+ for {
+ select {
+ case <-ctx.Done():
+ e.Close()
+ return
+ default:
+ n, err := unix.EpollWait(e.fd, events[:], -1)
+ if err != nil {
+ if err == unix.EINTR || err == unix.EAGAIN {
+ continue
+ }
+ // Should not happen.
+ panic(fmt.Errorf("cgroups: epoll wait: %w", err))
+ }
+ for i := 0; i < n; i++ {
+ e.process(ctx, uintptr(events[i].Fd))
+ }
+ }
+ }
+}
+
+func (e *epoller) add(id string, cg cgroups.Cgroup) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ fd, err := cg.OOMEventFD()
+ if err != nil {
+ return err
+ }
+ e.set[fd] = &item{
+ id: id,
+ cg: cg,
+ }
+ event := unix.EpollEvent{
+ Fd: int32(fd),
+ Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR,
+ }
+ return unix.EpollCtl(e.fd, unix.EPOLL_CTL_ADD, int(fd), &event)
+}
+
+func (e *epoller) process(ctx context.Context, fd uintptr) {
+ flush(fd)
+ e.mu.Lock()
+ i, ok := e.set[fd]
+ if !ok {
+ e.mu.Unlock()
+ return
+ }
+ e.mu.Unlock()
+ if i.cg.State() == cgroups.Deleted {
+ e.mu.Lock()
+ delete(e.set, fd)
+ e.mu.Unlock()
+ unix.Close(int(fd))
+ return
+ }
+ if err := e.publisher.Publish(ctx, runtime.TaskOOMEventTopic, &TaskOOM{
+ ContainerID: i.id,
+ }); err != nil {
+ // Should not happen.
+ panic(fmt.Errorf("publish OOM event: %w", err))
+ }
+}
+
+func flush(fd uintptr) error {
+ var buf [8]byte
+ _, err := unix.Read(int(fd), buf[:])
+ return err
+}
diff --git a/pkg/shim/v2/options/BUILD b/pkg/shim/v2/options/BUILD
new file mode 100644
index 000000000..ca212e874
--- /dev/null
+++ b/pkg/shim/v2/options/BUILD
@@ -0,0 +1,11 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "options",
+ srcs = [
+ "options.go",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/pkg/shim/v2/options/options.go b/pkg/shim/v2/options/options.go
new file mode 100644
index 000000000..de09f2f79
--- /dev/null
+++ b/pkg/shim/v2/options/options.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package options
+
+const OptionType = "io.containerd.runsc.v1.options"
+
+// Options is runtime options for io.containerd.runsc.v1.
+type Options struct {
+ // ShimCgroup is the cgroup the shim should be in.
+ ShimCgroup string `toml:"shim_cgroup"`
+ // IoUid is the I/O's pipes uid.
+ IoUid uint32 `toml:"io_uid"`
+ // IoUid is the I/O's pipes gid.
+ IoGid uint32 `toml:"io_gid"`
+ // BinaryName is the binary name of the runsc binary.
+ BinaryName string `toml:"binary_name"`
+ // Root is the runsc root directory.
+ Root string `toml:"root"`
+ // RunscConfig is a key/value map of all runsc flags.
+ RunscConfig map[string]string `toml:"runsc_config"`
+}
diff --git a/pkg/shim/v2/runtimeoptions/BUILD b/pkg/shim/v2/runtimeoptions/BUILD
new file mode 100644
index 000000000..01716034c
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library", "proto_library")
+
+package(licenses = ["notice"])
+
+proto_library(
+ name = "api",
+ srcs = [
+ "runtimeoptions.proto",
+ ],
+)
+
+go_library(
+ name = "runtimeoptions",
+ srcs = ["runtimeoptions.go"],
+ visibility = ["//pkg/shim/v2:__pkg__"],
+ deps = [
+ "//pkg/shim/v2/runtimeoptions:api_go_proto",
+ "@com_github_gogo_protobuf//proto:go_default_library",
+ ],
+)
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.go b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
new file mode 100644
index 000000000..1c1a0c5d1
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package runtimeoptions
+
+import (
+ proto "github.com/gogo/protobuf/proto"
+ pb "gvisor.dev/gvisor/pkg/shim/v2/runtimeoptions/api_go_proto"
+)
+
+type Options = pb.Options
+
+func init() {
+ proto.RegisterType((*Options)(nil), "cri.runtimeoptions.v1.Options")
+}
diff --git a/pkg/shim/v2/runtimeoptions/runtimeoptions.proto b/pkg/shim/v2/runtimeoptions/runtimeoptions.proto
new file mode 100644
index 000000000..edb19020a
--- /dev/null
+++ b/pkg/shim/v2/runtimeoptions/runtimeoptions.proto
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package runtimeoptions;
+
+// This is a version of the runtimeoptions CRI API that is vendored.
+//
+// Imported the full CRI package is a nightmare.
+message Options {
+ string type_url = 1;
+ string config_path = 2;
+}
diff --git a/pkg/shim/v2/service.go b/pkg/shim/v2/service.go
new file mode 100644
index 000000000..1534152fc
--- /dev/null
+++ b/pkg/shim/v2/service.go
@@ -0,0 +1,824 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package v2
+
+import (
+ "context"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/BurntSushi/toml"
+ "github.com/containerd/cgroups"
+ "github.com/containerd/console"
+ "github.com/containerd/containerd/api/events"
+ "github.com/containerd/containerd/api/types/task"
+ "github.com/containerd/containerd/errdefs"
+ "github.com/containerd/containerd/log"
+ "github.com/containerd/containerd/mount"
+ "github.com/containerd/containerd/namespaces"
+ "github.com/containerd/containerd/pkg/process"
+ "github.com/containerd/containerd/pkg/stdio"
+ "github.com/containerd/containerd/runtime"
+ "github.com/containerd/containerd/runtime/linux/runctypes"
+ "github.com/containerd/containerd/runtime/v2/shim"
+ taskAPI "github.com/containerd/containerd/runtime/v2/task"
+ "github.com/containerd/containerd/sys/reaper"
+ "github.com/containerd/typeurl"
+ "github.com/gogo/protobuf/types"
+ "golang.org/x/sys/unix"
+
+ "gvisor.dev/gvisor/pkg/shim/runsc"
+ "gvisor.dev/gvisor/pkg/shim/v1/proc"
+ "gvisor.dev/gvisor/pkg/shim/v1/utils"
+ "gvisor.dev/gvisor/pkg/shim/v2/options"
+ "gvisor.dev/gvisor/pkg/shim/v2/runtimeoptions"
+ "gvisor.dev/gvisor/runsc/specutils"
+)
+
+var (
+ empty = &types.Empty{}
+ bufPool = sync.Pool{
+ New: func() interface{} {
+ buffer := make([]byte, 32<<10)
+ return &buffer
+ },
+ }
+)
+
+var _ = (taskAPI.TaskService)(&service{})
+
+// configFile is the default config file name. For containerd 1.2,
+// we assume that a config.toml should exist in the runtime root.
+const configFile = "config.toml"
+
+// New returns a new shim service that can be used via GRPC.
+func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) {
+ ep, err := newOOMEpoller(publisher)
+ if err != nil {
+ return nil, err
+ }
+ go ep.run(ctx)
+ s := &service{
+ id: id,
+ context: ctx,
+ processes: make(map[string]process.Process),
+ events: make(chan interface{}, 128),
+ ec: proc.ExitCh,
+ oomPoller: ep,
+ cancel: cancel,
+ }
+ go s.processExits()
+ runsc.Monitor = reaper.Default
+ if err := s.initPlatform(); err != nil {
+ cancel()
+ return nil, fmt.Errorf("failed to initialized platform behavior: %w", err)
+ }
+ go s.forward(publisher)
+ return s, nil
+}
+
+// service is the shim implementation of a remote shim over GRPC.
+type service struct {
+ mu sync.Mutex
+
+ context context.Context
+ task process.Process
+ processes map[string]process.Process
+ events chan interface{}
+ platform stdio.Platform
+ opts options.Options
+ ec chan proc.Exit
+ oomPoller *epoller
+
+ id string
+ bundle string
+ cancel func()
+}
+
+func newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) {
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return nil, err
+ }
+ self, err := os.Executable()
+ if err != nil {
+ return nil, err
+ }
+ cwd, err := os.Getwd()
+ if err != nil {
+ return nil, err
+ }
+ args := []string{
+ "-namespace", ns,
+ "-address", containerdAddress,
+ "-publish-binary", containerdBinary,
+ }
+ cmd := exec.Command(self, args...)
+ cmd.Dir = cwd
+ cmd.Env = append(os.Environ(), "GOMAXPROCS=2")
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Setpgid: true,
+ }
+ return cmd, nil
+}
+
+func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (string, error) {
+ cmd, err := newCommand(ctx, containerdBinary, containerdAddress)
+ if err != nil {
+ return "", err
+ }
+ address, err := shim.SocketAddress(ctx, id)
+ if err != nil {
+ return "", err
+ }
+ socket, err := shim.NewSocket(address)
+ if err != nil {
+ return "", err
+ }
+ defer socket.Close()
+ f, err := socket.File()
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+
+ if err := cmd.Start(); err != nil {
+ return "", err
+ }
+ defer func() {
+ if err != nil {
+ cmd.Process.Kill()
+ }
+ }()
+ // make sure to wait after start
+ go cmd.Wait()
+ if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil {
+ return "", err
+ }
+ if err := shim.WriteAddress("address", address); err != nil {
+ return "", err
+ }
+ if err := shim.SetScore(cmd.Process.Pid); err != nil {
+ return "", fmt.Errorf("failed to set OOM Score on shim: %w", err)
+ }
+ return address, nil
+}
+
+func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) {
+ path, err := os.Getwd()
+ if err != nil {
+ return nil, err
+ }
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return nil, err
+ }
+ runtime, err := s.readRuntime(path)
+ if err != nil {
+ return nil, err
+ }
+ r := proc.NewRunsc(s.opts.Root, path, ns, runtime, nil)
+ if err := r.Delete(ctx, s.id, &runsc.DeleteOpts{
+ Force: true,
+ }); err != nil {
+ log.L.Printf("failed to remove runc container: %v", err)
+ }
+ if err := mount.UnmountAll(filepath.Join(path, "rootfs"), 0); err != nil {
+ log.L.Printf("failed to cleanup rootfs mount: %v", err)
+ }
+ return &taskAPI.DeleteResponse{
+ ExitedAt: time.Now(),
+ ExitStatus: 128 + uint32(unix.SIGKILL),
+ }, nil
+}
+
+func (s *service) readRuntime(path string) (string, error) {
+ data, err := ioutil.ReadFile(filepath.Join(path, "runtime"))
+ if err != nil {
+ return "", err
+ }
+ return string(data), nil
+}
+
+func (s *service) writeRuntime(path, runtime string) error {
+ return ioutil.WriteFile(filepath.Join(path, "runtime"), []byte(runtime), 0600)
+}
+
+// Create creates a new initial process and container with the underlying OCI
+// runtime.
+func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *taskAPI.CreateTaskResponse, err error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("create namespace: %w", err)
+ }
+
+ // Read from root for now.
+ var opts options.Options
+ if r.Options != nil {
+ v, err := typeurl.UnmarshalAny(r.Options)
+ if err != nil {
+ return nil, err
+ }
+ var path string
+ switch o := v.(type) {
+ case *runctypes.CreateOptions: // containerd 1.2.x
+ opts.IoUid = o.IoUid
+ opts.IoGid = o.IoGid
+ opts.ShimCgroup = o.ShimCgroup
+ case *runctypes.RuncOptions: // containerd 1.2.x
+ root := proc.RunscRoot
+ if o.RuntimeRoot != "" {
+ root = o.RuntimeRoot
+ }
+
+ opts.BinaryName = o.Runtime
+
+ path = filepath.Join(root, configFile)
+ if _, err := os.Stat(path); err != nil {
+ if !os.IsNotExist(err) {
+ return nil, fmt.Errorf("stat config file %q: %w", path, err)
+ }
+ // A config file in runtime root is not required.
+ path = ""
+ }
+ case *runtimeoptions.Options: // containerd 1.3.x+
+ if o.ConfigPath == "" {
+ break
+ }
+ if o.TypeUrl != options.OptionType {
+ return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl)
+ }
+ path = o.ConfigPath
+ default:
+ return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl)
+ }
+ if path != "" {
+ if _, err = toml.DecodeFile(path, &opts); err != nil {
+ return nil, fmt.Errorf("decode config file %q: %w", path, err)
+ }
+ }
+ }
+
+ var mounts []proc.Mount
+ for _, m := range r.Rootfs {
+ mounts = append(mounts, proc.Mount{
+ Type: m.Type,
+ Source: m.Source,
+ Target: m.Target,
+ Options: m.Options,
+ })
+ }
+
+ rootfs := filepath.Join(r.Bundle, "rootfs")
+ if err := os.Mkdir(rootfs, 0711); err != nil && !os.IsExist(err) {
+ return nil, err
+ }
+
+ config := &proc.CreateConfig{
+ ID: r.ID,
+ Bundle: r.Bundle,
+ Runtime: opts.BinaryName,
+ Rootfs: mounts,
+ Terminal: r.Terminal,
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Options: r.Options,
+ }
+ if err := s.writeRuntime(r.Bundle, opts.BinaryName); err != nil {
+ return nil, err
+ }
+ defer func() {
+ if err != nil {
+ if err := mount.UnmountAll(rootfs, 0); err != nil {
+ log.L.Printf("failed to cleanup rootfs mount: %v", err)
+ }
+ }
+ }()
+ for _, rm := range mounts {
+ m := &mount.Mount{
+ Type: rm.Type,
+ Source: rm.Source,
+ Options: rm.Options,
+ }
+ if err := m.Mount(rootfs); err != nil {
+ return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err)
+ }
+ }
+ process, err := newInit(
+ ctx,
+ r.Bundle,
+ filepath.Join(r.Bundle, "work"),
+ ns,
+ s.platform,
+ config,
+ &opts,
+ rootfs,
+ )
+ if err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ if err := process.Create(ctx, config); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ // Save the main task id and bundle to the shim for additional
+ // requests.
+ s.id = r.ID
+ s.bundle = r.Bundle
+
+ // Set up OOM notification on the sandbox's cgroup. This is done on
+ // sandbox create since the sandbox process will be created here.
+ pid := process.Pid()
+ if pid > 0 {
+ cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(pid))
+ if err != nil {
+ return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err)
+ }
+ if err := s.oomPoller.add(s.id, cg); err != nil {
+ return nil, fmt.Errorf("add cg to OOM monitor: %w", err)
+ }
+ }
+ s.task = process
+ s.opts = opts
+ return &taskAPI.CreateTaskResponse{
+ Pid: uint32(process.Pid()),
+ }, nil
+
+}
+
+// Start starts a process.
+func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ if err := p.Start(ctx); err != nil {
+ return nil, err
+ }
+ // TODO: Set the cgroup and oom notifications on restore.
+ // https://github.com/google/gvisor-containerd-shim/issues/58
+ return &taskAPI.StartResponse{
+ Pid: uint32(p.Pid()),
+ }, nil
+}
+
+// Delete deletes the initial process and container.
+func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+ if err := p.Delete(ctx); err != nil {
+ return nil, err
+ }
+ isTask := r.ExecID == ""
+ if !isTask {
+ s.mu.Lock()
+ delete(s.processes, r.ExecID)
+ s.mu.Unlock()
+ }
+ if isTask && s.platform != nil {
+ s.platform.Close()
+ }
+ return &taskAPI.DeleteResponse{
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ Pid: uint32(p.Pid()),
+ }, nil
+}
+
+// Exec spawns an additional process inside the container.
+func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) {
+ s.mu.Lock()
+ p := s.processes[r.ExecID]
+ s.mu.Unlock()
+ if p != nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID)
+ }
+ p = s.task
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+ process, err := p.(*proc.Init).Exec(ctx, s.bundle, &proc.ExecConfig{
+ ID: r.ExecID,
+ Terminal: r.Terminal,
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Spec: r.Spec,
+ })
+ if err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ s.mu.Lock()
+ s.processes[r.ExecID] = process
+ s.mu.Unlock()
+ return empty, nil
+}
+
+// ResizePty resizes the terminal of a process.
+func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ ws := console.WinSize{
+ Width: uint16(r.Width),
+ Height: uint16(r.Height),
+ }
+ if err := p.Resize(ws); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ return empty, nil
+}
+
+// State returns runtime state information for a process.
+func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ st, err := p.Status(ctx)
+ if err != nil {
+ return nil, err
+ }
+ status := task.StatusUnknown
+ switch st {
+ case "created":
+ status = task.StatusCreated
+ case "running":
+ status = task.StatusRunning
+ case "stopped":
+ status = task.StatusStopped
+ }
+ sio := p.Stdio()
+ return &taskAPI.StateResponse{
+ ID: p.ID(),
+ Bundle: s.bundle,
+ Pid: uint32(p.Pid()),
+ Status: status,
+ Stdin: sio.Stdin,
+ Stdout: sio.Stdout,
+ Stderr: sio.Stderr,
+ Terminal: sio.Terminal,
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ }, nil
+}
+
+// Pause the container.
+func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Resume the container.
+func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Kill a process with the provided signal.
+func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+ if err := p.Kill(ctx, r.Signal, r.All); err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ return empty, nil
+}
+
+// Pids returns all pids inside the container.
+func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) {
+ pids, err := s.getContainerPids(ctx, r.ID)
+ if err != nil {
+ return nil, errdefs.ToGRPC(err)
+ }
+ var processes []*task.ProcessInfo
+ for _, pid := range pids {
+ pInfo := task.ProcessInfo{
+ Pid: pid,
+ }
+ for _, p := range s.processes {
+ if p.Pid() == int(pid) {
+ d := &runctypes.ProcessDetails{
+ ExecID: p.ID(),
+ }
+ a, err := typeurl.MarshalAny(d)
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err)
+ }
+ pInfo.Info = a
+ break
+ }
+ }
+ processes = append(processes, &pInfo)
+ }
+ return &taskAPI.PidsResponse{
+ Processes: processes,
+ }, nil
+}
+
+// CloseIO closes the I/O context of a process.
+func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ if stdin := p.Stdin(); stdin != nil {
+ if err := stdin.Close(); err != nil {
+ return nil, fmt.Errorf("close stdin: %w", err)
+ }
+ }
+ return empty, nil
+}
+
+// Checkpoint checkpoints the container.
+func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Connect returns shim information such as the shim's pid.
+func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) {
+ var pid int
+ if s.task != nil {
+ pid = s.task.Pid()
+ }
+ return &taskAPI.ConnectResponse{
+ ShimPid: uint32(os.Getpid()),
+ TaskPid: uint32(pid),
+ }, nil
+}
+
+func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
+ s.cancel()
+ os.Exit(0)
+ return empty, nil
+}
+
+func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
+ path, err := os.Getwd()
+ if err != nil {
+ return nil, err
+ }
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return nil, err
+ }
+ runtime, err := s.readRuntime(path)
+ if err != nil {
+ return nil, err
+ }
+ rs := proc.NewRunsc(s.opts.Root, path, ns, runtime, nil)
+ stats, err := rs.Stats(ctx, s.id)
+ if err != nil {
+ return nil, err
+ }
+
+ // gvisor currently (as of 2020-03-03) only returns the total memory
+ // usage and current PID value[0]. However, we copy the common fields here
+ // so that future updates will propagate correct information. We're
+ // using the cgroups.Metrics structure so we're returning the same type
+ // as runc.
+ //
+ // [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81
+ data, err := typeurl.MarshalAny(&cgroups.Metrics{
+ CPU: &cgroups.CPUStat{
+ Usage: &cgroups.CPUUsage{
+ Total: stats.Cpu.Usage.Total,
+ Kernel: stats.Cpu.Usage.Kernel,
+ User: stats.Cpu.Usage.User,
+ PerCPU: stats.Cpu.Usage.Percpu,
+ },
+ Throttling: &cgroups.Throttle{
+ Periods: stats.Cpu.Throttling.Periods,
+ ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods,
+ ThrottledTime: stats.Cpu.Throttling.ThrottledTime,
+ },
+ },
+ Memory: &cgroups.MemoryStat{
+ Cache: stats.Memory.Cache,
+ Usage: &cgroups.MemoryEntry{
+ Limit: stats.Memory.Usage.Limit,
+ Usage: stats.Memory.Usage.Usage,
+ Max: stats.Memory.Usage.Max,
+ Failcnt: stats.Memory.Usage.Failcnt,
+ },
+ Swap: &cgroups.MemoryEntry{
+ Limit: stats.Memory.Swap.Limit,
+ Usage: stats.Memory.Swap.Usage,
+ Max: stats.Memory.Swap.Max,
+ Failcnt: stats.Memory.Swap.Failcnt,
+ },
+ Kernel: &cgroups.MemoryEntry{
+ Limit: stats.Memory.Kernel.Limit,
+ Usage: stats.Memory.Kernel.Usage,
+ Max: stats.Memory.Kernel.Max,
+ Failcnt: stats.Memory.Kernel.Failcnt,
+ },
+ KernelTCP: &cgroups.MemoryEntry{
+ Limit: stats.Memory.KernelTCP.Limit,
+ Usage: stats.Memory.KernelTCP.Usage,
+ Max: stats.Memory.KernelTCP.Max,
+ Failcnt: stats.Memory.KernelTCP.Failcnt,
+ },
+ },
+ Pids: &cgroups.PidsStat{
+ Current: stats.Pids.Current,
+ Limit: stats.Pids.Limit,
+ },
+ })
+ if err != nil {
+ return nil, err
+ }
+ return &taskAPI.StatsResponse{
+ Stats: data,
+ }, nil
+}
+
+// Update updates a running container.
+func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) {
+ return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
+}
+
+// Wait waits for a process to exit.
+func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) {
+ p, err := s.getProcess(r.ExecID)
+ if err != nil {
+ return nil, err
+ }
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
+ }
+ p.Wait()
+
+ return &taskAPI.WaitResponse{
+ ExitStatus: uint32(p.ExitStatus()),
+ ExitedAt: p.ExitedAt(),
+ }, nil
+}
+
+func (s *service) processExits() {
+ for e := range s.ec {
+ s.checkProcesses(e)
+ }
+}
+
+func (s *service) checkProcesses(e proc.Exit) {
+ // TODO(random-liu): Add `shouldKillAll` logic if container pid
+ // namespace is supported.
+ for _, p := range s.allProcesses() {
+ if p.ID() == e.ID {
+ if ip, ok := p.(*proc.Init); ok {
+ // Ensure all children are killed.
+ if err := ip.KillAll(s.context); err != nil {
+ log.G(s.context).WithError(err).WithField("id", ip.ID()).
+ Error("failed to kill init's children")
+ }
+ }
+ p.SetExited(e.Status)
+ s.events <- &events.TaskExit{
+ ContainerID: s.id,
+ ID: p.ID(),
+ Pid: uint32(p.Pid()),
+ ExitStatus: uint32(e.Status),
+ ExitedAt: p.ExitedAt(),
+ }
+ return
+ }
+ }
+}
+
+func (s *service) allProcesses() (o []process.Process) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ for _, p := range s.processes {
+ o = append(o, p)
+ }
+ if s.task != nil {
+ o = append(o, s.task)
+ }
+ return o
+}
+
+func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) {
+ s.mu.Lock()
+ p := s.task
+ s.mu.Unlock()
+ if p == nil {
+ return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition)
+ }
+ ps, err := p.(*proc.Init).Runtime().Ps(ctx, id)
+ if err != nil {
+ return nil, err
+ }
+ pids := make([]uint32, 0, len(ps))
+ for _, pid := range ps {
+ pids = append(pids, uint32(pid))
+ }
+ return pids, nil
+}
+
+func (s *service) forward(publisher shim.Publisher) {
+ for e := range s.events {
+ ctx, cancel := context.WithTimeout(s.context, 5*time.Second)
+ err := publisher.Publish(ctx, getTopic(e), e)
+ cancel()
+ if err != nil {
+ // Should not happen.
+ panic(fmt.Errorf("post event: %w", err))
+ }
+ }
+}
+
+func (s *service) getProcess(execID string) (process.Process, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ if execID == "" {
+ return s.task, nil
+ }
+ p := s.processes[execID]
+ if p == nil {
+ return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID)
+ }
+ return p, nil
+}
+
+func getTopic(e interface{}) string {
+ switch e.(type) {
+ case *events.TaskCreate:
+ return runtime.TaskCreateEventTopic
+ case *events.TaskStart:
+ return runtime.TaskStartEventTopic
+ case *events.TaskOOM:
+ return runtime.TaskOOMEventTopic
+ case *events.TaskExit:
+ return runtime.TaskExitEventTopic
+ case *events.TaskDelete:
+ return runtime.TaskDeleteEventTopic
+ case *events.TaskExecAdded:
+ return runtime.TaskExecAddedEventTopic
+ case *events.TaskExecStarted:
+ return runtime.TaskExecStartedEventTopic
+ default:
+ log.L.Printf("no topic for type %#v", e)
+ }
+ return runtime.TaskUnknownTopic
+}
+
+func newInit(ctx context.Context, path, workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *options.Options, rootfs string) (*proc.Init, error) {
+ spec, err := utils.ReadSpec(r.Bundle)
+ if err != nil {
+ return nil, fmt.Errorf("read oci spec: %w", err)
+ }
+ if err := utils.UpdateVolumeAnnotations(r.Bundle, spec); err != nil {
+ return nil, fmt.Errorf("update volume annotations: %w", err)
+ }
+ runsc.FormatLogPath(r.ID, options.RunscConfig)
+ runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig)
+ p := proc.New(r.ID, runtime, stdio.Stdio{
+ Stdin: r.Stdin,
+ Stdout: r.Stdout,
+ Stderr: r.Stderr,
+ Terminal: r.Terminal,
+ })
+ p.Bundle = r.Bundle
+ p.Platform = platform
+ p.Rootfs = rootfs
+ p.WorkDir = workDir
+ p.IoUID = int(options.IoUid)
+ p.IoGID = int(options.IoGid)
+ p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox
+ p.UserLog = utils.UserLogPath(spec)
+ p.Monitor = reaper.Default
+ return p, nil
+}
diff --git a/pkg/shim/v2/service_linux.go b/pkg/shim/v2/service_linux.go
new file mode 100644
index 000000000..1800ab90b
--- /dev/null
+++ b/pkg/shim/v2/service_linux.go
@@ -0,0 +1,108 @@
+// Copyright 2018 The containerd Authors.
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package v2
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "sync"
+ "syscall"
+
+ "github.com/containerd/console"
+ "github.com/containerd/fifo"
+)
+
+type linuxPlatform struct {
+ epoller *console.Epoller
+}
+
+func (p *linuxPlatform) CopyConsole(ctx context.Context, console console.Console, stdin, stdout, stderr string, wg *sync.WaitGroup) (console.Console, error) {
+ if p.epoller == nil {
+ return nil, fmt.Errorf("uninitialized epoller")
+ }
+
+ epollConsole, err := p.epoller.Add(console)
+ if err != nil {
+ return nil, err
+ }
+
+ if stdin != "" {
+ in, err := fifo.OpenFifo(context.Background(), stdin, syscall.O_RDONLY|syscall.O_NONBLOCK, 0)
+ if err != nil {
+ return nil, err
+ }
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ io.CopyBuffer(epollConsole, in, *p)
+ }()
+ }
+
+ outw, err := fifo.OpenFifo(ctx, stdout, syscall.O_WRONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ outr, err := fifo.OpenFifo(ctx, stdout, syscall.O_RDONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ wg.Add(1)
+ go func() {
+ p := bufPool.Get().(*[]byte)
+ defer bufPool.Put(p)
+ io.CopyBuffer(outw, epollConsole, *p)
+ epollConsole.Close()
+ outr.Close()
+ outw.Close()
+ wg.Done()
+ }()
+ return epollConsole, nil
+}
+
+func (p *linuxPlatform) ShutdownConsole(ctx context.Context, cons console.Console) error {
+ if p.epoller == nil {
+ return fmt.Errorf("uninitialized epoller")
+ }
+ epollConsole, ok := cons.(*console.EpollConsole)
+ if !ok {
+ return fmt.Errorf("expected EpollConsole, got %#v", cons)
+ }
+ return epollConsole.Shutdown(p.epoller.CloseConsole)
+}
+
+func (p *linuxPlatform) Close() error {
+ return p.epoller.Close()
+}
+
+// initialize a single epoll fd to manage our consoles. `initPlatform` should
+// only be called once.
+func (s *service) initPlatform() error {
+ if s.platform != nil {
+ return nil
+ }
+ epoller, err := console.NewEpoller()
+ if err != nil {
+ return fmt.Errorf("failed to initialize epoller: %w", err)
+ }
+ s.platform = &linuxPlatform{
+ epoller: epoller,
+ }
+ go epoller.Wait()
+ return nil
+}