diff options
Diffstat (limited to 'runsc')
50 files changed, 8348 insertions, 0 deletions
diff --git a/runsc/BUILD b/runsc/BUILD new file mode 100644 index 000000000..3651c2d30 --- /dev/null +++ b/runsc/BUILD @@ -0,0 +1,17 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_binary") + +go_binary( + name = "runsc", + srcs = [ + "main.go", + ], + pure = "on", + deps = [ + "//pkg/log", + "//runsc/boot", + "//runsc/cmd", + "@com_github_google_subcommands//:go_default_library", + ], +) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD new file mode 100644 index 000000000..88736cfa4 --- /dev/null +++ b/runsc/boot/BUILD @@ -0,0 +1,88 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "boot", + srcs = [ + "capability.go", + "config.go", + "controller.go", + "events.go", + "fds.go", + "fs.go", + "limits.go", + "loader.go", + "network.go", + "strace.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/boot", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/control/server", + "//pkg/cpuid", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/control", + "//pkg/sentry/fs", + "//pkg/sentry/fs/dev", + "//pkg/sentry/fs/gofer", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/proc", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/sys", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/sighandling", + "//pkg/sentry/socket/epsocket", + "//pkg/sentry/socket/hostinet", + "//pkg/sentry/socket/netlink", + "//pkg/sentry/socket/netlink/route", + "//pkg/sentry/socket/unix", + "//pkg/sentry/strace", + "//pkg/sentry/syscalls/linux", + "//pkg/sentry/time", + "//pkg/sentry/usage", + "//pkg/sentry/watchdog", + "//pkg/syserror", + "//pkg/tcpip", + 
"//pkg/tcpip/link/fdbased", + "//pkg/tcpip/link/loopback", + "//pkg/tcpip/link/sniffer", + "//pkg/tcpip/network/arp", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/tcp", + "//pkg/tcpip/transport/udp", + "//pkg/urpc", + "//runsc/boot/filter", + "//runsc/specutils", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_syndtr_gocapability//capability:go_default_library", + ], +) + +go_test( + name = "boot_test", + size = "small", + srcs = ["loader_test.go"], + embed = [":boot"], + deps = [ + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/context/contexttest", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/boot/capability.go b/runsc/boot/capability.go new file mode 100644 index 000000000..4c6a59245 --- /dev/null +++ b/runsc/boot/capability.go @@ -0,0 +1,120 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/syndtr/gocapability/capability" +) + +// ApplyCaps applies the capabilities in the spec to the current thread. +// +// Note that it must be called with current thread locked. 
+func ApplyCaps(conf *Config, caps *specs.LinuxCapabilities) error { + setter, err := capability.NewPid2(os.Getpid()) + if err != nil { + return err + } + + bounding, err := capsFromNames(caps.Bounding) + if err != nil { + return err + } + effective, err := capsFromNames(caps.Effective) + if err != nil { + return err + } + permitted, err := capsFromNames(caps.Permitted) + if err != nil { + return err + } + inheritable, err := capsFromNames(caps.Inheritable) + if err != nil { + return err + } + ambient, err := capsFromNames(caps.Ambient) + if err != nil { + return err + } + + // Ptrace platform requires extra capabilities. + if conf.Platform == PlatformPtrace { + bounding = append(bounding, capability.CAP_SYS_PTRACE) + effective = append(effective, capability.CAP_SYS_PTRACE) + permitted = append(permitted, capability.CAP_SYS_PTRACE) + } + + setter.Set(capability.BOUNDS, bounding...) + setter.Set(capability.PERMITTED, permitted...) + setter.Set(capability.INHERITABLE, inheritable...) + setter.Set(capability.EFFECTIVE, effective...) + setter.Set(capability.AMBIENT, ambient...) 
+ return setter.Apply(capability.CAPS | capability.BOUNDS | capability.AMBS) +} + +// capsFromNames translates capability names into capability.Cap values by +// looking each one up in capFromName. It returns an error naming the first +// entry that is not present in the table. +func capsFromNames(names []string) ([]capability.Cap, error) { + var caps []capability.Cap + for _, name := range names { + cap, ok := capFromName[name] + if !ok { + return nil, fmt.Errorf("invalid capability %q", name) + } + caps = append(caps, cap) + } + return caps, nil +} + +// capFromName maps capability names, as written in the spec, to their +// capability.Cap values. +var capFromName = map[string]capability.Cap{ + "CAP_CHOWN": capability.CAP_CHOWN, + "CAP_DAC_OVERRIDE": capability.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": capability.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": capability.CAP_FOWNER, + "CAP_FSETID": capability.CAP_FSETID, + "CAP_KILL": capability.CAP_KILL, + "CAP_SETGID": capability.CAP_SETGID, + "CAP_SETUID": capability.CAP_SETUID, + "CAP_SETPCAP": capability.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": capability.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": capability.CAP_NET_BIND_SERVICE, + "CAP_NET_BROADCAST": capability.CAP_NET_BROADCAST, + "CAP_NET_ADMIN": capability.CAP_NET_ADMIN, + "CAP_NET_RAW": capability.CAP_NET_RAW, + "CAP_IPC_LOCK": capability.CAP_IPC_LOCK, + "CAP_IPC_OWNER": capability.CAP_IPC_OWNER, + "CAP_SYS_MODULE": capability.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": capability.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": capability.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": capability.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": capability.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": capability.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": capability.CAP_SYS_BOOT, + "CAP_SYS_NICE": capability.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": capability.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": capability.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": capability.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": capability.CAP_MKNOD, + "CAP_LEASE": capability.CAP_LEASE, + "CAP_AUDIT_WRITE": capability.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": capability.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": capability.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": capability.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": capability.CAP_MAC_ADMIN, + "CAP_SYSLOG": capability.CAP_SYSLOG, +
"CAP_WAKE_ALARM": capability.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": capability.CAP_BLOCK_SUSPEND, +} diff --git a/runsc/boot/config.go b/runsc/boot/config.go new file mode 100644 index 000000000..f3e33e89a --- /dev/null +++ b/runsc/boot/config.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import "fmt" + +// PlatformType tells which platform to use. +type PlatformType int + +const ( + // Ptrace runs the sandbox with the ptrace platform. + PlatformPtrace PlatformType = iota + + // KVM runs the sandbox with the KVM platform. + PlatformKVM +) + +// MakePlatformType converts type from string. +func MakePlatformType(s string) (PlatformType, error) { + switch s { + case "ptrace": + return PlatformPtrace, nil + case "kvm": + return PlatformKVM, nil + default: + return 0, fmt.Errorf("invalid platform type %q", s) + } +} + +func (p PlatformType) String() string { + switch p { + case PlatformPtrace: + return "ptrace" + case PlatformKVM: + return "kvm" + default: + return fmt.Sprintf("unknown(%d)", p) + } +} + +// FileAccessType tells how the filesystem is accessed. +type FileAccessType int + +const ( + // FileAccessProxy sends IO requests to a Gofer process that validates the + // requests and forwards them to the host. + FileAccessProxy FileAccessType = iota + + // FileAccessDirect connects the sandbox directly to the host filesystem. 
+ FileAccessDirect +) + +// MakeFileAccessType converts type from string. +func MakeFileAccessType(s string) (FileAccessType, error) { + switch s { + case "proxy": + return FileAccessProxy, nil + case "direct": + return FileAccessDirect, nil + default: + return 0, fmt.Errorf("invalid file access type %q", s) + } +} + +func (f FileAccessType) String() string { + switch f { + case FileAccessProxy: + return "proxy" + case FileAccessDirect: + return "direct" + default: + return fmt.Sprintf("unknown(%d)", f) + } +} + +// NetworkType tells which network stack to use. +type NetworkType int + +const ( + // NetworkSandbox uses internal network stack, isolated from the host. + NetworkSandbox NetworkType = iota + + // NetworkHost redirects network related syscalls to the host network. + NetworkHost + + // NetworkNone sets up just loopback using netstack. + NetworkNone +) + +// MakeNetworkType converts type from string. +func MakeNetworkType(s string) (NetworkType, error) { + switch s { + case "sandbox": + return NetworkSandbox, nil + case "host": + return NetworkHost, nil + case "none": + return NetworkNone, nil + default: + return 0, fmt.Errorf("invalid network type %q", s) + } +} + +func (n NetworkType) String() string { + switch n { + case NetworkSandbox: + return "sandbox" + case NetworkHost: + return "host" + case NetworkNone: + return "none" + default: + return fmt.Sprintf("unknown(%d)", n) + } +} + +// Config holds configuration that is not part of the runtime spec. +type Config struct { + // RootDir is the runtime root directory. + RootDir string + + // FileAccess indicates how the filesystem is accessed. + FileAccess FileAccessType + + // Overlay is whether to wrap the root filesystem in an overlay. + Overlay bool + + // Network indicates what type of network to use. + Network NetworkType + + // LogPackets indicates that all network packets should be logged. + LogPackets bool + + // Platform is the platform to run on. 
+ Platform PlatformType + + // Strace indicates that strace should be enabled. + Strace bool + + // StraceSyscalls is the set of syscalls to trace. If Strace is + // true and this list is empty, then all syscalls will be traced. + StraceSyscalls []string + + // StraceLogSize is the max size of data blobs to display. + StraceLogSize uint + + // DisableSeccomp indicates whether seccomp syscall filters should be + // disabled. Pardon the double negation, but default to enabled is important. + DisableSeccomp bool +} diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go new file mode 100644 index 000000000..4d4ef7256 --- /dev/null +++ b/runsc/boot/controller.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" +) + +const ( + // ApplicationStart is the URPC endpoint for starting a sandboxed app. + ApplicationStart = "application.Start" + + // ApplicationProcesses is the URPC endpoint for getting the list of + // processes running in a sandbox. + ApplicationProcesses = "application.Processes" + + // ApplicationExecute is the URPC endpoint for executing a command in a + // sandbox.
+ ApplicationExecute = "application.Execute" + + // ApplicationEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ApplicationEvent = "application.Event" + + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links + // and routes in a network stack. + NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" +) + +// ControlSocketAddr generates an abstract unix socket name for the given id. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-sandbox.%s", id) +} + +// controller holds the control server, and is used for communication into the +// sandbox. +type controller struct { + // srv is the control server. + srv *server.Server + + // app holds the application methods. + app *application +} + +// newController creates a new controller and starts it listening. +func newController(fd int, k *kernel.Kernel) (*controller, error) { + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + app := &application{ + startChan: make(chan struct{}), + startResultChan: make(chan error, 1), + k: k, + } + srv.Register(app) + + if eps, ok := k.NetworkStack().(*epsocket.Stack); ok { + net := &Network{ + Stack: eps.Stack, + } + srv.Register(net) + } + + if err := srv.StartServing(); err != nil { + return nil, err + } + + return &controller{ + srv: srv, + app: app, + }, nil +} + +// application contains methods that control the sandboxed application. +type application struct { + // startChan is used to signal when the application process should be + // started. + startChan chan struct{} + + // startResultChan is used to signal when the application has started. Any + // errors encountered during startup will be sent to the channel. A nil value + // indicates success. + startResultChan chan error + + // k is the emulated linux kernel on which the sandboxed + // application runs. + k *kernel.Kernel +} + +// Start will start the application process.
+func (a *application) Start(_, _ *struct{}) error { + // Tell the application to start and wait for the result. + a.startChan <- struct{}{} + return <-a.startResultChan +} + +// Processes retrieves information about processes running in the sandbox. +func (a *application) Processes(_, out *[]*control.Process) error { + return control.Processes(a.k, out) +} + +// Execute runs a command on a created or running sandbox. +func (a *application) Execute(e *control.ExecArgs, waitStatus *uint32) error { + proc := control.Proc{Kernel: a.k} + if err := proc.Exec(e, waitStatus); err != nil { + return fmt.Errorf("error executing: %+v: %v", e, err) + } + return nil +} diff --git a/runsc/boot/events.go b/runsc/boot/events.go new file mode 100644 index 000000000..ef6459b01 --- /dev/null +++ b/runsc/boot/events.go @@ -0,0 +1,81 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" +) + +// Event struct for encoding the event data to JSON. Corresponds to runc's +// main.event struct. +type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// Stats is the runc specific stats structure for stability when encoding and +// decoding stats. +// TODO: Many fields aren't obtainable due to a lack of cgroups. 
+type Stats struct { + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` +} + +// Pids contains stats on processes. +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +// MemoryEntry contains stats on a kind of memory. +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +// Memory contains stats on memory. +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +func (a *application) Event(_ *struct{}, out *Event) error { + stats := &Stats{} + stats.populateMemory(a.k) + stats.populatePIDs(a.k) + *out = Event{Type: "stats", Data: stats} + return nil +} + +func (s *Stats) populateMemory(k *kernel.Kernel) { + mem := k.Platform.Memory() + mem.UpdateUsage() + _, totalUsage := usage.MemoryAccounting.Copy() + s.Memory.Usage = MemoryEntry{ + Usage: totalUsage, + } +} + +func (s *Stats) populatePIDs(k *kernel.Kernel) { + s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups())) +} diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go new file mode 100644 index 000000000..0449e243d --- /dev/null +++ b/runsc/boot/fds.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// createFDMap creates an fd map that contains stdin, stdout, and stderr. If +// console is true, then ioctl calls will be passed through to the host fd. +// +// TODO: We currently aren't passing any FDs in to the sandbox, so +// there's not much else for this function to do. It will get more complicated +// when gofers enter the picture. Also the LISTEN_FDS environment variable +// allows passing arbitrary FDs to the sandbox, which we do not yet support. +func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool) (*kernel.FDMap, error) { + fdm := k.NewFDMap() + defer fdm.DecRef() + + // Maps sandbox fd to host fd.
+ fdMap := map[int]int{ + 0: syscall.Stdin, + 1: syscall.Stdout, + 2: syscall.Stderr, + } + mounter := fs.FileOwnerFromContext(ctx) + + for sfd, hfd := range fdMap { + file, err := host.ImportFile(ctx, hfd, mounter, console /* allow ioctls */) + if err != nil { + return nil, fmt.Errorf("failed to import fd %d: %v", hfd, err) + } + defer file.DecRef() + if err := fdm.NewFDAt(kdefs.FD(sfd), file, kernel.FDFlags{}, l); err != nil { + return nil, fmt.Errorf("failed to add imported fd %d to FDMap: %v", hfd, err) + } + } + + fdm.IncRef() + return fdm, nil +} diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD new file mode 100644 index 000000000..fd1b18717 --- /dev/null +++ b/runsc/boot/filter/BUILD @@ -0,0 +1,26 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "filter", + srcs = [ + "config.go", + "extra_filters.go", + "extra_filters_msan.go", + "extra_filters_race.go", + "filter.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/boot/filter", + visibility = [ + "//runsc/boot:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go new file mode 100644 index 000000000..130e987df --- /dev/null +++ b/runsc/boot/filter/config.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "syscall" + + "golang.org/x/sys/unix" +) + +// allowedSyscalls is the set of syscalls executed by the Sentry +// to the host OS. +var allowedSyscalls = []uintptr{ + syscall.SYS_ACCEPT, + syscall.SYS_ARCH_PRCTL, + syscall.SYS_CLOCK_GETTIME, + syscall.SYS_CLONE, + syscall.SYS_CLOSE, + syscall.SYS_DUP, + syscall.SYS_DUP2, + syscall.SYS_EPOLL_CREATE1, + syscall.SYS_EPOLL_CTL, + syscall.SYS_EPOLL_PWAIT, + syscall.SYS_EPOLL_WAIT, + syscall.SYS_EVENTFD2, + syscall.SYS_EXIT, + syscall.SYS_EXIT_GROUP, + syscall.SYS_FALLOCATE, + syscall.SYS_FCHMOD, + syscall.SYS_FCNTL, + syscall.SYS_FSTAT, + syscall.SYS_FSYNC, + syscall.SYS_FTRUNCATE, + syscall.SYS_FUTEX, + syscall.SYS_GETDENTS64, + syscall.SYS_GETPID, + unix.SYS_GETRANDOM, + syscall.SYS_GETSOCKOPT, + syscall.SYS_GETTID, + syscall.SYS_GETTIMEOFDAY, + syscall.SYS_LISTEN, + syscall.SYS_LSEEK, + syscall.SYS_MADVISE, + syscall.SYS_MINCORE, + syscall.SYS_MMAP, + syscall.SYS_MPROTECT, + syscall.SYS_MUNMAP, + syscall.SYS_NEWFSTATAT, + syscall.SYS_POLL, + syscall.SYS_PREAD64, + syscall.SYS_PSELECT6, + syscall.SYS_PWRITE64, + syscall.SYS_READ, + syscall.SYS_READLINKAT, + syscall.SYS_READV, + syscall.SYS_RECVMSG, + syscall.SYS_RENAMEAT, + syscall.SYS_RESTART_SYSCALL, + syscall.SYS_RT_SIGACTION, + syscall.SYS_RT_SIGPROCMASK, + syscall.SYS_RT_SIGRETURN, + syscall.SYS_SCHED_YIELD, + syscall.SYS_SENDMSG, + syscall.SYS_SETITIMER, + syscall.SYS_SHUTDOWN, + syscall.SYS_SIGALTSTACK, + syscall.SYS_SYNC_FILE_RANGE, + syscall.SYS_TGKILL, + syscall.SYS_UTIMENSAT, + syscall.SYS_WRITE, + syscall.SYS_WRITEV, +} + +// TODO: Ioctl is needed in order to support tty consoles. +// Once filters support argument-checking, we should only allow ioctl +// with tty-related arguments. 
+func consoleFilters() []uintptr { + return []uintptr{ + syscall.SYS_IOCTL, + } +} + +// whitelistFSFilters returns syscalls made by whitelistFS. Using WhitelistFS +// is less secure because it runs inside the Sentry and must be able to perform +// file operations that would otherwise be disabled by seccomp when a Gofer is +// used. When whitelistFS is not used, opening new FDs in the Sentry is +// disallowed. +func whitelistFSFilters() []uintptr { + return []uintptr{ + syscall.SYS_ACCESS, + syscall.SYS_FCHMOD, + syscall.SYS_FSTAT, + syscall.SYS_FSYNC, + syscall.SYS_FTRUNCATE, + syscall.SYS_GETCWD, + syscall.SYS_GETDENTS, + syscall.SYS_GETDENTS64, + syscall.SYS_LSEEK, + syscall.SYS_LSTAT, + syscall.SYS_MKDIR, + syscall.SYS_MKDIRAT, + syscall.SYS_NEWFSTATAT, + syscall.SYS_OPEN, + syscall.SYS_OPENAT, + syscall.SYS_PREAD64, + syscall.SYS_PWRITE64, + syscall.SYS_READ, + syscall.SYS_READLINK, + syscall.SYS_READLINKAT, + syscall.SYS_RENAMEAT, + syscall.SYS_STAT, + syscall.SYS_SYMLINK, + syscall.SYS_SYMLINKAT, + syscall.SYS_SYNC_FILE_RANGE, + syscall.SYS_UNLINK, + syscall.SYS_UNLINKAT, + syscall.SYS_UTIMENSAT, + syscall.SYS_WRITE, + } +} + +// hostInetFilters contains syscalls that are needed by sentry/socket/hostinet. +func hostInetFilters() []uintptr { + return []uintptr{ + syscall.SYS_ACCEPT4, + syscall.SYS_BIND, + syscall.SYS_CONNECT, + syscall.SYS_GETPEERNAME, + syscall.SYS_GETSOCKNAME, + syscall.SYS_GETSOCKOPT, + syscall.SYS_IOCTL, + syscall.SYS_LISTEN, + syscall.SYS_READV, + syscall.SYS_RECVFROM, + syscall.SYS_RECVMSG, + syscall.SYS_SENDMSG, + syscall.SYS_SENDTO, + syscall.SYS_SETSOCKOPT, + syscall.SYS_SHUTDOWN, + syscall.SYS_SOCKET, + syscall.SYS_WRITEV, + } +} + +// ptraceFilters returns syscalls made exclusively by the ptrace platform. +func ptraceFilters() []uintptr { + return []uintptr{ + syscall.SYS_PTRACE, + syscall.SYS_WAIT4, + unix.SYS_GETCPU, + unix.SYS_SCHED_SETAFFINITY, + } +} + +// kvmFilters returns syscalls made exclusively by the KVM platform.
+func kvmFilters() []uintptr { + return []uintptr{ + syscall.SYS_IOCTL, + syscall.SYS_RT_SIGSUSPEND, + syscall.SYS_RT_SIGTIMEDWAIT, + 0xffffffffffffffff, // KVM uses syscall -1 to transition to host. + } +} diff --git a/runsc/boot/filter/extra_filters.go b/runsc/boot/filter/extra_filters.go new file mode 100644 index 000000000..e10d9bf4c --- /dev/null +++ b/runsc/boot/filter/extra_filters.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !msan,!race + +package filter + +// instrumentationFilters returns additional filters for syscalls used by +// Go instrumentation tools, e.g. -race, -msan. +// Returns empty when disabled. +func instrumentationFilters() []uintptr { + return nil +} diff --git a/runsc/boot/filter/extra_filters_msan.go b/runsc/boot/filter/extra_filters_msan.go new file mode 100644 index 000000000..a862340f6 --- /dev/null +++ b/runsc/boot/filter/extra_filters_msan.go @@ -0,0 +1,30 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build msan + +package filter + +import ( + "syscall" +) + +// instrumentationFilters returns additional filters for syscalls used by MSAN. +func instrumentationFilters() []uintptr { + Report("MSAN is enabled: syscall filters less restrictive!") + return []uintptr{ + syscall.SYS_SCHED_GETAFFINITY, + syscall.SYS_SET_ROBUST_LIST, + } +} diff --git a/runsc/boot/filter/extra_filters_race.go b/runsc/boot/filter/extra_filters_race.go new file mode 100644 index 000000000..b0c74a58a --- /dev/null +++ b/runsc/boot/filter/extra_filters_race.go @@ -0,0 +1,33 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package filter + +import ( + "syscall" +) + +// instrumentationFilters returns additional filters for syscalls used by TSAN. +func instrumentationFilters() []uintptr { + Report("TSAN is enabled: syscall filters less restrictive!") + return []uintptr{ + syscall.SYS_BRK, + syscall.SYS_MUNLOCK, + syscall.SYS_NANOSLEEP, + syscall.SYS_OPEN, + syscall.SYS_SET_ROBUST_LIST, + } +} diff --git a/runsc/boot/filter/filter.go b/runsc/boot/filter/filter.go new file mode 100644 index 000000000..3ba56a318 --- /dev/null +++ b/runsc/boot/filter/filter.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filter defines all syscalls the sandbox is allowed to make +// to the host, and installs seccomp filters to prevent prohibited +// syscalls in case it's compromised. +package filter + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/seccomp" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" +) + +// Install installs seccomp filters based on the given platform. +func Install(p platform.Platform, whitelistFS, console, hostNetwork bool) error { + s := allowedSyscalls + + // Set of additional filters used by -race and -msan. Returns empty + // when not enabled. + s = append(s, instrumentationFilters()...) + + if whitelistFS { + Report("direct file access allows unrestricted file access!") + s = append(s, whitelistFSFilters()...) + } + if console { + Report("console is enabled: syscall filters less restrictive!") + s = append(s, consoleFilters()...) + } + if hostNetwork { + Report("host networking enabled: syscall filters less restrictive!") + s = append(s, hostInetFilters()...) + } + + switch p := p.(type) { + case *ptrace.PTrace: + s = append(s, ptraceFilters()...) + case *kvm.KVM: + s = append(s, kvmFilters()...)
+ default: + return fmt.Errorf("unknown platform type %T", p) + } + + // TODO: Set kill=true when SECCOMP_RET_KILL_PROCESS is supported. + return seccomp.Install(s, false) +} + +// Report writes a warning message to the log. +func Report(msg string) { + log.Warningf("*** SECCOMP WARNING: %s", msg) +} diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go new file mode 100644 index 000000000..2073bd0b1 --- /dev/null +++ b/runsc/boot/fs.go @@ -0,0 +1,441 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + // Include filesystem types that OCI spec might mount. 
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +// createMountNamespace creates a mount manager containing the root filesystem +// and all mounts. +func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + fds := &fdDispenser{fds: ioFDs} + + // Create the MountNamespace from the root. + rootInode, err := createRootMount(ctx, spec, conf, fds) + if err != nil { + return nil, fmt.Errorf("failed to create root overlay: %v", err) + } + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("failed to construct MountNamespace: %v", err) + } + + // Keep track of whether proc, sys, and tmp were mounted. + var procMounted, sysMounted, tmpMounted bool + + // Mount all submounts from the spec. + for _, m := range spec.Mounts { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. 
+ if strings.HasPrefix(m.Destination, "/dev") { + log.Warningf("ignoring dev mount at %q", m.Destination) + continue + } + switch m.Destination { + case "/proc": + procMounted = true + case "/sys": + sysMounted = true + case "/tmp": + tmpMounted = true + } + + if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { + return nil, err + } + } + + // Always mount /dev. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devtmpfs", + Destination: "/dev", + }); err != nil { + return nil, err + } + + // Mount proc and sys even if the user did not ask for it, as the spec + // says we SHOULD. + if !procMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "proc", + Destination: "/proc", + }); err != nil { + return nil, err + } + } + if !sysMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "sysfs", + Destination: "/sys", + }); err != nil { + return nil, err + } + } + + // Technically we don't have to mount tmpfs at /tmp, as we could just + // rely on the host /tmp, but this is a nice optimization, and fixes + // some apps that call mknod in /tmp. + if !tmpMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "tmpfs", + Destination: "/tmp", + }); err != nil { + return nil, err + } + } + + if !fds.empty() { + return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) + } + + return mns, nil +} + +// createRootMount creates the root filesystem. +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. 
+ mf := fs.MountSourceFlags{ + ReadOnly: spec.Root.Readonly, + NoAtime: true, + } + + var ( + rootInode *fs.Inode + err error + ) + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + hostFS := mustFindFilesystem("9p") + rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + case FileAccessDirect: + hostFS := mustFindFilesystem("whitelistfs") + rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + default: + return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("error adding submount overlay: %v", err) + } + + if conf.Overlay { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to \"/\" type root", spec.Root.Path) + return rootInode, nil +} + +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. 
+ lowerFlags.ReadOnly = false + + tmpFS := mustFindFilesystem("tmpfs") + upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") + if err != nil { + return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) + } + return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) +} + +func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + var data []string + var fsName string + var useOverlay bool + switch m.Type { + case "proc", "sysfs", "devtmpfs": + fsName = m.Type + case "none": + fsName = "sysfs" + case "tmpfs": + fsName = m.Type + + // tmpfs has some extra supported options that we must pass through. + var err error + data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + if err != nil { + return err + } + case "bind": + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + fsName = "9p" + data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} + case FileAccessDirect: + fsName = "whitelistfs" + data = []string{"root=" + m.Source, "dont_translate_ownership=true"} + default: + return fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + fi, err := os.Stat(m.Source) + if err != nil { + return err + } + // Add overlay to all writable mounts, except when mapping an individual file. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir() + default: + // TODO: Support all the mount types and make this a + // fatal error. Most applications will "just work" without + // them, so this is a warning for now. + // we do not support. + log.Warningf("ignoring unknown filesystem type %q", m.Type) + return nil + } + + // All filesystem names should have been mapped to something we know. 
+ filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + mf.NoAtime = true + + inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) + if err != nil { + return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) + } + + // If there are submounts, we need to overlay the mount on top of a + // ramfs with stub directories for submount paths. + // + // We do not do this for /dev, since there will usually be submounts in + // the spec, but our devfs implementation contains all the necessary + // directories and files (well, most of them anyways). + if m.Destination != "/dev" { + submounts := subtargets(m.Destination, spec.Mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("error adding submount overlay: %v", err) + } + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil { + return err + } + } + + root := mns.Root() + defer root.DecRef() + dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + return nil +} + +func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + root := mns.Root() + defer root.DecRef() + + // Starting at the root, walk the path. 
+ parent := root + ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) + for i := 0; i < len(ps); i++ { + if ps[i] == "" { + // This will be case for the first and last element, if the path + // begins or ends with '/'. Note that we always treat the path as + // absolute, regardless of what the first character contains. + continue + } + d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + if err == syserror.ENOENT { + // If we encounter a path that does not exist, then + // create it. + if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + } + if d, err = parent.Walk(ctx, root, ps[i]); err != nil { + return fmt.Errorf("walk to %q failed: %v", ps[i], err) + } + } else if err != nil { + return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + } + parent = d + } + return nil +} + +// parseAndFilterOptions parses a MountOptions slice and filters by the allowed +// keys. +func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { + var out []string + for _, o := range opts { + kv := strings.Split(o, "=") + switch len(kv) { + case 1: + if contains(allowedKeys, o) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv) + case 2: + if contains(allowedKeys, kv[0]) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv[0]) + default: + return nil, fmt.Errorf("invalid option %q", o) + } + } + return out, nil +} + +func destinations(mounts []specs.Mount, extra ...string) []string { + var ds []string + for _, m := range mounts { + ds = append(ds, m.Destination) + } + return append(ds, extra...) 
+} + +func mountFlags(opts []string) fs.MountSourceFlags { + mf := fs.MountSourceFlags{} + for _, o := range opts { + switch o { + case "ro": + mf.ReadOnly = true + case "noatime": + mf.NoAtime = true + default: + log.Warningf("ignorning unknown mount option %q", o) + } + } + return mf +} + +func contains(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} + +func mustFindFilesystem(name string) fs.Filesystem { + fs, ok := fs.FindFilesystem(name) + if !ok { + panic(fmt.Sprintf("could not find filesystem %q", name)) + } + return fs +} + +// addSubmountOverlay overlays the inode over a ramfs tree containing the given +// paths. +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { + // There is no real filesystem backing this ramfs tree, so we pass in + // "nil" here. + mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts) + if err != nil { + return nil, fmt.Errorf("error creating mount tree: %v", err) + } + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + if err != nil { + return nil, fmt.Errorf("failed to make mount overlay: %v", err) + } + return overlayInode, err +} + +// subtargets takes a set of Mounts and returns only the targets that are +// children of the given root. The returned paths are relative to the root. +func subtargets(root string, mnts []specs.Mount) []string { + r := filepath.Clean(root) + var targets []string + for _, mnt := range mnts { + t := filepath.Clean(mnt.Destination) + if strings.HasPrefix(t, r) { + // Make the mnt path relative to the root path. If the + // result is empty, then mnt IS the root mount, not a + // submount. We don't want to include those. 
+ if t := strings.TrimPrefix(t, r); t != "" { + targets = append(targets, t) + } + } + } + return targets +} diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go new file mode 100644 index 000000000..ea72de8e9 --- /dev/null +++ b/runsc/boot/limits.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" +) + +// Mapping from linux resource names to limits.LimitType. 
+var fromLinuxResource = map[string]limits.LimitType{ + "RLIMIT_CPU": limits.CPU, + "RLIMIT_FSIZE": limits.FileSize, + "RLIMIT_DATA": limits.Data, + "RLIMIT_STACK": limits.Stack, + "RLIMIT_CORE": limits.Core, + "RLIMIT_RSS": limits.Rss, + "RLIMIT_NPROC": limits.ProcessCount, + "RLIMIT_NOFILE": limits.NumberOfFiles, + "RLIMIT_MEMLOCK": limits.MemoryPagesLocked, + "RLIMIT_AS": limits.AS, + "RLIMIT_LOCKS": limits.Locks, + "RLIMIT_SIGPENDING": limits.SignalsPending, + "RLIMIT_MSGQUEUE": limits.MessageQueueBytes, + "RLIMIT_NICE": limits.Nice, + "RLIMIT_RTPRIO": limits.RealTimePriority, + "RLIMIT_RTTIME": limits.Rttime, +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := limits.NewLinuxDistroLimitSet() + if err != nil { + return nil, err + } + for _, rl := range spec.Process.Rlimits { + lt, ok := fromLinuxResource[rl.Type] + if !ok { + return nil, fmt.Errorf("unknown resource %q", rl.Type) + } + ls.SetUnchecked(lt, limits.Limit{ + Cur: rl.Soft, + Max: rl.Hard, + }) + } + return ls, nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go new file mode 100644 index 000000000..a470cb054 --- /dev/null +++ b/runsc/boot/loader.go @@ -0,0 +1,354 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package boot loads the kernel and runs the application. 
// Loader keeps state needed to start the kernel and run the application.
type Loader struct {
	// k is the kernel.
	k *kernel.Kernel

	// ctrl is the control server.
	ctrl *controller

	// conf is the configuration the loader was created with. It is
	// consulted when the application is run to decide on host network
	// setup and seccomp filter installation.
	conf *Config

	// console is set to true if terminal is enabled.
	console bool

	// watchdog is created together with the loader, started when the
	// application is run and stopped when the loader is destroyed.
	watchdog *watchdog.Watchdog

	// stopSignalForwarding disables forwarding of signals to the sandboxed
	// app. It should be called when a sandbox is destroyed.
	stopSignalForwarding func()

	// procArgs refers to the initial application task.
	procArgs kernel.CreateProcessArgs
}
+ creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) + if err != nil { + return nil, fmt.Errorf("error creating credentials: %v", err) + } + + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + + ipcns := kernel.NewIPCNamespace() + + if err := enableStrace(conf); err != nil { + return nil, fmt.Errorf("failed to enable strace: %v", err) + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) + if err != nil { + return nil, fmt.Errorf("error getting executable path: %v", err) + } + + // Create the process arguments. + procArgs := kernel.CreateProcessArgs{ + Filename: exec, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, + Credentials: creds, + // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so + // it must wait until we have a Kernel. + Umask: uint(syscall.Umask(0)), + Limits: ls, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + UTSNamespace: utsns, + IPCNamespace: ipcns, + } + + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). + networkStack := newEmptyNetworkStack(conf) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. 
+ if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + ApplicationCores: 8, + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, + }); err != nil { + return nil, fmt.Errorf("error initializing kernel: %v", err) + } + + // Turn on packet logging if enabled. + if conf.LogPackets { + log.Infof("Packet logging enabled") + atomic.StoreUint32(&sniffer.LogPackets, 1) + } else { + log.Infof("Packet logging disabled") + atomic.StoreUint32(&sniffer.LogPackets, 0) + } + + // Create the control server using the provided FD. + // + // This must be done *after* we have initialized the kernel since the + // controller is used to configure the kernel's network stack. + // + // This should also be *before* we create the process, since a + // misconfigured process will cause an error, and we want the control + // server up before that so that we don't time out trying to connect to + // it. + ctrl, err := newController(controllerFD, k) + if err != nil { + return nil, fmt.Errorf("error creating control server: %v", err) + } + + ctx := procArgs.NewContext(k) + + // Create the virtual filesystem. + mm, err := createMountNamespace(ctx, spec, conf, ioFDs) + if err != nil { + return nil, fmt.Errorf("error creating mounts: %v", err) + } + k.SetRootMountNamespace(mm) + + // Create the FD map, which will set stdin, stdout, and stderr. If console + // is true, then ioctl calls will be passed through to the host fd. + fdm, err := createFDMap(ctx, k, ls, console) + if err != nil { + return nil, fmt.Errorf("error importing fds: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. We + // won't need ours either way. + procArgs.FDMap = fdm + + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). 
+ if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + // Ensure that most signals received in sentry context are forwarded to + // the emulated kernel. + stopSignalForwarding := sighandling.StartForwarding(k) + + watchdog := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning) + return &Loader{ + k: k, + ctrl: ctrl, + conf: conf, + console: console, + watchdog: watchdog, + stopSignalForwarding: stopSignalForwarding, + procArgs: procArgs, + }, nil +} + +// Destroy cleans up all resources used by the loader. +func (l *Loader) Destroy() { + if l.ctrl != nil { + // Shut down control server. + l.ctrl.srv.Stop() + } + l.stopSignalForwarding() + l.watchdog.Stop() +} + +func createPlatform(conf *Config) (platform.Platform, error) { + switch conf.Platform { + case PlatformPtrace: + log.Infof("Platform: ptrace") + return ptrace.New() + case PlatformKVM: + log.Infof("Platform: kvm") + return kvm.New() + default: + return nil, fmt.Errorf("invalid platform %v", conf.Platform) + } +} + +// Run runs the application. +func (l *Loader) Run() error { + err := l.run() + l.ctrl.app.startResultChan <- err + return err +} + +func (l *Loader) run() error { + if l.conf.Network == NetworkHost { + // Delay host network configuration to this point because network namespace + // is configured after the loader is created and before Run() is called. + log.Debugf("Configuring host network") + stack := l.k.NetworkStack().(*hostinet.Stack) + if err := stack.Configure(); err != nil { + return err + } + } + + // Finally done with all configuration. Setup filters before user code + // is loaded. + if l.conf.DisableSeccomp { + filter.Report("syscall filter is DISABLED. 
Running in less secure mode.") + } else { + whitelistFS := l.conf.FileAccess == FileAccessDirect + hostNet := l.conf.Network == NetworkHost + if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil { + return fmt.Errorf("Failed to install seccomp filters: %v", err) + } + } + + // Create the initial application task. + if _, err := l.k.CreateProcess(l.procArgs); err != nil { + return fmt.Errorf("failed to create init process: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. + l.procArgs.FDMap.DecRef() + + l.watchdog.Start() + return l.k.Start() +} + +// WaitForStartSignal waits for a start signal from the control server. +func (l *Loader) WaitForStartSignal() { + <-l.ctrl.app.startChan +} + +// WaitExit waits for the application to exit, and returns the application's +// exit status. +func (l *Loader) WaitExit() kernel.ExitStatus { + // Wait for application. + l.k.WaitExited() + + return l.k.GlobalInit().ExitStatus() +} + +func newEmptyNetworkStack(conf *Config) inet.Stack { + switch conf.Network { + case NetworkHost: + return hostinet.NewStack() + + case NetworkNone, NetworkSandbox: + // NetworkNone sets up loopback using netstack. + netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName} + protoNames := []string{tcp.ProtocolName, udp.ProtocolName} + return &epsocket.Stack{stack.New(netProtos, protoNames)} + + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } +} diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go new file mode 100644 index 000000000..2fc16b241 --- /dev/null +++ b/runsc/boot/loader_test.go @@ -0,0 +1,238 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "os" + "testing" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +func init() { + log.SetLevel(log.Debug) +} + +// testSpec returns a simple spec that can be used in tests. +func testSpec() *specs.Spec { + return &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: []string{"/bin/true"}, + }, + } +} + +func createLoader() (*Loader, error) { + fd, err := server.CreateSocket(ControlSocketAddr("123")) + if err != nil { + return nil, err + } + conf := &Config{ + RootDir: "unused_root_dir", + Network: NetworkNone, + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } + return New(testSpec(), conf, fd, nil, false) +} + +// TestRun runs a simple application in a sandbox and checks that it succeeds. +func TestRun(t *testing.T) { + s, err := createLoader() + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // Run the application. + if err := s.Run(); err != nil { + t.Errorf("error running application: %v", err) + } + + // Wait for the application to exit. It should succeed. 
+ if status := s.WaitExit(); status.Code != 0 || status.Signo != 0 { + t.Errorf("application exited with status %+v, want 0", status) + } +} + +// TestStartSignal tests that the controller Start message will cause +// WaitForStartSignal to return. +func TestStartSignal(t *testing.T) { + s, err := createLoader() + if err != nil { + t.Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // We aren't going to wait on this application, so the control server + // needs to be shut down manually. + defer s.ctrl.srv.Stop() + + // Start a goroutine that calls WaitForStartSignal and writes to a + // channel when it returns. + waitFinished := make(chan struct{}) + go func() { + s.WaitForStartSignal() + // Pretent that Run() executed and returned no error. + s.ctrl.app.startResultChan <- nil + waitFinished <- struct{}{} + }() + + // Nothing has been written to the channel, so waitFinished should not + // return. Give it a little bit of time to make sure the goroutine has + // started. + select { + case <-waitFinished: + t.Errorf("WaitForStartSignal completed but it should not have") + case <-time.After(50 * time.Millisecond): + // OK. + } + + // Trigger the control server Start method. + if err := s.ctrl.app.Start(nil, nil); err != nil { + t.Errorf("error calling Start: %v", err) + } + + // Now WaitForStartSignal should return (within a short amount of + // time). + select { + case <-waitFinished: + // OK. + case <-time.After(50 * time.Millisecond): + t.Errorf("WaitForStartSignal did not complete but it should have") + } + +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + conf := &Config{ + RootDir: "unused_root_dir", + FileAccess: FileAccessDirect, + DisableSeccomp: true, + } + + testCases := []struct { + name string + // Spec that will be used to create the mount manager. 
Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string + }{ + { + // Only proc. + name: "only proc mount", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /proc, /dev, and /sys should always be mounted. + expectedPaths: []string{"/proc", "/dev", "/sys"}, + }, + { + // Mount at a deep path, with many components that do + // not exist in the root. + name: "deep mount path", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/some/very/very/deep/path", + Type: "tmpfs", + }, + { + Destination: "/proc", + Type: "tmpfs", + }, + }, + }, + // /some/deep/path should be mounted, along with /proc, + // /dev, and /sys. + expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + { + // Mounts are nested inside eachother. + name: "nested mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", + }, + { + Destination: "/foo", + Type: "tmpfs", + }, + { + Destination: "/foo/bar", + Type: "tmpfs", + }, + { + Destination: "/foo/bar/baz", + Type: "tmpfs", + }, + { + // A deep path that is in foo but not the other mounts. 
+ Destination: "/foo/some/very/very/deep/path", + Type: "tmpfs", + }, + }, + }, + expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, + }, + } + + for _, tc := range testCases { + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx, &tc.spec, conf, nil) + if err != nil { + t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err) + } + defer mm.DecRef() + root := mm.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + if _, err := mm.FindInode(ctx, root, root, p, 0); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } + } + } +} diff --git a/runsc/boot/network.go b/runsc/boot/network.go new file mode 100644 index 000000000..d2b52c823 --- /dev/null +++ b/runsc/boot/network.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// Network exposes methods that can be used to configure a network stack.
type Network struct {
	// Stack is the network stack that NICs, addresses and the route table
	// are installed into.
	Stack *stack.Stack
}

// Route represents a route in the network stack. A Route whose fields are all
// nil is considered unset (see Empty).
type Route struct {
	Destination net.IP
	Mask        net.IPMask
	Gateway     net.IP
}

// DefaultRoute represents a catch all route to the default gateway.
type DefaultRoute struct {
	// Route is the catch-all route itself.
	Route Route
	// Name is the name of the link the route goes through. It must match
	// the Name of one of the links created in the same call, otherwise
	// CreateLinksAndRoutes fails with an "invalid interface name" error.
	Name string
}

// FDBasedLink configures an fd-based link.
type FDBasedLink struct {
	// Name is the name given to the NIC created for this link.
	Name string
	// MTU is passed to the fd-based link endpoint as its MTU.
	MTU int
	// Addresses are the IP addresses added to the NIC.
	Addresses []net.IP
	// Routes are added to the stack's route table for this NIC.
	Routes []Route
}

// LoopbackLink configures a loopback link.
type LoopbackLink struct {
	// Name is the name given to the NIC created for this link.
	Name string
	// Addresses are the IP addresses added to the NIC.
	Addresses []net.IP
	// Routes are added to the stack's route table for this NIC.
	Routes []Route
}

// CreateLinksAndRoutesArgs are arguments to CreateLinksAndRoutes.
type CreateLinksAndRoutesArgs struct {
	// FilePayload contains the fds associated with the FDBasedLinks. The
	// two slices must have the same length; Files[i] is the fd used for
	// FDBasedLinks[i].
	urpc.FilePayload

	LoopbackLinks []LoopbackLink
	FDBasedLinks  []FDBasedLink

	// DefaultGateway is optional: it is ignored when its Route is empty.
	DefaultGateway DefaultRoute
}
+func (r *Route) Empty() bool { + return r.Destination == nil && r.Mask == nil && r.Gateway == nil +} + +func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route { + return tcpip.Route{ + Destination: ipToAddress(r.Destination), + Gateway: ipToAddress(r.Gateway), + Mask: ipToAddress(net.IP(r.Mask)), + NIC: id, + } +} + +// CreateLinksAndRoutes creates links and routes in a network stack. It should +// only be called once. +func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error { + if len(args.FilePayload.Files) != len(args.FDBasedLinks) { + return fmt.Errorf("FilePayload must be same length at FDBasedLinks") + } + + var nicID tcpip.NICID + nicids := make(map[string]tcpip.NICID) + + // Collect routes from all links. + var routes []tcpip.Route + + // Loopback normally appear before other interfaces. + for _, link := range args.LoopbackLinks { + nicID++ + nicids[link.Name] = nicID + + linkEP := loopback.New() + + log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + routes = append(routes, r.toTcpipRoute(nicID)) + } + } + + for i, link := range args.FDBasedLinks { + nicID++ + nicids[link.Name] = nicID + + // Copy the underlying FD. 
+ oldFD := args.FilePayload.Files[i].Fd() + newFD, err := syscall.Dup(int(oldFD)) + if err != nil { + return fmt.Errorf("failed to dup FD %v: %v", oldFD, err) + } + + linkEP := fdbased.New(&fdbased.Options{ + FD: newFD, + MTU: uint32(link.MTU), + ChecksumOffload: false, + EthernetHeader: true, + Address: tcpip.LinkAddress(generateRndMac()), + }) + + log.Infof("Enabling interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses) + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return err + } + + // Collect the routes from this link. + for _, r := range link.Routes { + routes = append(routes, r.toTcpipRoute(nicID)) + } + } + + if !args.DefaultGateway.Route.Empty() { + nicID, ok := nicids[args.DefaultGateway.Name] + if !ok { + return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name) + } + routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID)) + } + + log.Infof("Setting routes %+v", routes) + n.Stack.SetRouteTable(routes) + return nil +} + +// createNICWithAddrs creates a NIC in the network stack and adds the given +// addresses. +func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP) error { + if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil { + return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err) + } + + // Always start with an arp address for the NIC. 
// generateRndMac returns a random, locally administered, unicast MAC address.
// Copied from eth_random_addr() (include/linux/etherdevice.h)
//
// NOTE(review): rand is math/rand here, so the bytes are pseudo-random and
// depend on how (and whether) the generator is seeded elsewhere; confirm that
// is acceptable for link addresses.
func generateRndMac() net.HardwareAddr {
	const (
		multicastBit     = 0x1 // must be clear: the address is unicast
		localAssignedBit = 0x2 // must be set: locally assigned (IEEE802)
	)

	addr := make(net.HardwareAddr, 6)
	rand.Read(addr)
	addr[0] = (addr[0] &^ multicastBit) | localAssignedBit
	return addr
}
+// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/strace" +) + +func enableStrace(conf *Config) error { + // We must initialize even if strace is not enabled. + strace.Initialize() + + if !conf.Strace { + return nil + } + + max := conf.StraceLogSize + if max == 0 { + max = 1024 + } + strace.LogMaximumSize = max + + if len(conf.StraceSyscalls) == 0 { + strace.EnableAll(strace.SinkTypeLog) + return nil + } + return strace.Enable(conf.StraceSyscalls, strace.SinkTypeLog) +} diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD new file mode 100644 index 000000000..128c8f7e6 --- /dev/null +++ b/runsc/cmd/BUILD @@ -0,0 +1,58 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "cmd", + srcs = [ + "boot.go", + "cmd.go", + "create.go", + "delete.go", + "events.go", + "exec.go", + "gofer.go", + "kill.go", + "list.go", + "path.go", + "ps.go", + "run.go", + "start.go", + "state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/cmd", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/log", + "//pkg/p9", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//pkg/urpc", + "//runsc/boot", + "//runsc/fsgofer", + "//runsc/sandbox", + "//runsc/specutils", + "@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "cmd_test", + size = "small", + srcs = ["exec_test.go"], + embed = [":cmd"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/urpc", + "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", + 
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go new file mode 100644 index 000000000..0dad6da79 --- /dev/null +++ b/runsc/cmd/boot.go @@ -0,0 +1,161 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + "runtime" + "runtime/debug" + "strings" + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Boot implements subcommands.Command for the "boot" command which starts a +// new sandbox. It should not be called directly. +type Boot struct { + // bundleDir is the path to the bundle directory. + bundleDir string + + // controllerFD is the file descriptor of a stream socket for the + // control server that is donated to this process. + controllerFD int + + // ioFDs is the list of FDs used to connect to FS gofers. + ioFDs intFlags + + // console is set to true if the sandbox should allow terminal ioctl(2) + // syscalls. + console bool + + // applyCaps determines if capabilities defined in the spec should be applied + // to the process. + applyCaps bool +} + +// Name implements subcommands.Command.Name. 
+func (*Boot) Name() string { + return "boot" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Boot) Synopsis() string { + return "launch a sandbox process (internal use only)" +} + +// Usage implements subcommands.Command.Usage. +func (*Boot) Usage() string { + return `boot [flags]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (b *Boot) SetFlags(f *flag.FlagSet) { + f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") + f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") + f.Var(&b.ioFDs, "io-fds", "list of FDs to connect 9P clients. They must follow this order: root first, then mounts as defined in the spec") + f.BoolVar(&b.console, "console", false, "set to true if the sandbox should allow terminal ioctl(2) syscalls") + f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") +} + +// Execute implements subcommands.Command.Execute. It starts a sandbox in a +// waiting state. +func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if b.bundleDir == "" || b.controllerFD == -1 || f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + + // Ensure that if there is a panic, all goroutine stacks are printed. + debug.SetTraceback("all") + + // Get the spec from the bundleDir. + spec, err := specutils.ReadSpec(b.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Turn any relative paths in the spec to absolute by prepending the bundleDir. 
+ spec.Root.Path = absPath(b.bundleDir, spec.Root.Path) + for _, m := range spec.Mounts { + if m.Source != "" { + m.Source = absPath(b.bundleDir, m.Source) + } + } + + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + if b.applyCaps { + setCapsAndCallSelf(conf, spec) + Fatalf("setCapsAndCallSelf must never return") + } + + // Create the loader. + s, err := boot.New(spec, conf, b.controllerFD, b.ioFDs.GetArray(), b.console) + if err != nil { + Fatalf("error creating loader: %v", err) + } + defer s.Destroy() + + // Wait for the start signal from runsc. + s.WaitForStartSignal() + + // Run the application and wait for it to finish. + if err := s.Run(); err != nil { + Fatalf("error running sandbox: %v", err) + } + + ws := s.WaitExit() + log.Infof("application exiting with %+v", ws) + *waitStatus = syscall.WaitStatus(ws.Status()) + return subcommands.ExitSuccess +} + +// setCapsAndCallSelf sets capabilities to the current thread and then execve's +// itself again with the same arguments except '--apply-caps' to restart the +// whole process with the desired capabilities. +func setCapsAndCallSelf(conf *boot.Config, spec *specs.Spec) { + // Keep thread locked while capabilities are changed. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := boot.ApplyCaps(conf, spec.Process.Capabilities); err != nil { + Fatalf("ApplyCaps, err: %v", err) + } + binPath, err := specutils.BinPath() + if err != nil { + Fatalf("%v", err) + } + + // Remove --apply-caps arg to call myself. + var args []string + for _, arg := range os.Args { + if !strings.Contains(arg, "apply-caps") { + args = append(args, arg) + } + } + + log.Infof("Execve 'boot' again, bye!") + log.Infof("%s %v", binPath, args) + syscall.Exec(binPath, args, []string{}) +} diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go new file mode 100644 index 000000000..d4b834213 --- /dev/null +++ b/runsc/cmd/cmd.go @@ -0,0 +1,77 @@ +// Copyright 2018 Google Inc. 
// intFlags can be used with int flags that appear multiple times. Every
// occurrence of the flag appends one value, in command line order.
type intFlags []int

// String implements flag.Value.
func (i *intFlags) String() string {
	return fmt.Sprintf("%v", *i)
}

// Get implements flag.Getter.
func (i *intFlags) Get() interface{} {
	return i
}

// GetArray returns array of FDs.
func (i *intFlags) GetArray() []int {
	return *i
}

// Set implements flag.Value. It parses s as a decimal integer and appends it
// to the list. Negative values are rejected; zero is accepted.
func (i *intFlags) Set(s string) error {
	fd, err := strconv.Atoi(s)
	if err != nil {
		return fmt.Errorf("invalid flag value: %v", err)
	}
	if fd < 0 {
		// Zero is allowed, so the message must not claim "greater than 0".
		return fmt.Errorf("flag value must be non-negative: %d", fd)
	}
	*i = append(*i, fd)
	return nil
}
+func (*Create) Name() string { + return "create" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Create) Synopsis() string { + return "create a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Create) Usage() string { + return `create [flags] <container id> - create a secure container +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *Create) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.StringVar(&c.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal") + f.StringVar(&c.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") +} + +// Execute implements subcommands.Command.Execute. +func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + bundleDir := c.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Create the sandbox process, passing additional command line + // arguments to the sandbox process. + if _, err := sandbox.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, commandLineFlags()); err != nil { + Fatalf("error creating sandbox: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go new file mode 100644 index 000000000..a497c034d --- /dev/null +++ b/runsc/cmd/delete.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Delete implements subcommands.Command for the "delete" command. +type Delete struct { + // force indicates that the sandbox should be terminated if running. + force bool +} + +// Name implements subcommands.Command.Name. +func (*Delete) Name() string { + return "delete" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Delete) Synopsis() string { + return "delete resources held by a container" +} + +// Usage implements subcommands.Command.Usage. +func (*Delete) Usage() string { + return `delete [flags] <container ids>` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (d *Delete) SetFlags(f *flag.FlagSet) { + f.BoolVar(&d.force, "force", false, "terminate sandbox if running") +} + +// Execute implements subcommands.Command.Execute. 
+func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() == 0 { + f.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + + for i := 0; i < f.NArg(); i++ { + id := f.Arg(i) + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox %q: %v", id, err) + } + if !d.force && (s.Status == sandbox.Running) { + Fatalf("cannot stop running sandbox without --force flag") + } + if err := s.Destroy(); err != nil { + Fatalf("error destroying sandbox: %v", err) + } + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go new file mode 100644 index 000000000..afd42c2f2 --- /dev/null +++ b/runsc/cmd/events.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "os" + "time" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Events implements subcommands.Command for the "events" command. +type Events struct { + // The interval between stats reporting. + intervalSec int + // If true, events will print a single group of stats and exit. + stats bool +} + +// Name implements subcommands.Command.Name. 
+func (*Events) Name() string { + return "events" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Events) Synopsis() string { + return "display container events such as OOM notifications, cpu, memory, and IO usage statistics" +} + +// Usage implements subcommands.Command.Usage. +func (*Events) Usage() string { + return `<container-id> + +Where "<container-id>" is the name for the instance of the container. + +The events command displays information about the container. By default the +information is displayed once every 5 seconds. + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (evs *Events) SetFlags(f *flag.FlagSet) { + f.IntVar(&evs.intervalSec, "interval", 5, "set the stats collection interval, in seconds") + f.BoolVar(&evs.stats, "stats", false, "display the container's stats then exit") +} + +// Execute implements subcommands.Command.Execute. +func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandox: %v", err) + } + + // Repeatedly get stats from the container. + for { + // Get the event and print it as JSON. + ev, err := s.Event() + if err != nil { + log.Warningf("error getting events for sandbox: %v", err) + } + // err must be preserved because it is used below when breaking + // out of the loop. + b, err := json.Marshal(ev) + if err != nil { + log.Warningf("error while marshalling event %v: %v", ev, err) + } else { + os.Stdout.Write(b) + } + + // If we're only running once, break. If we're only running + // once and there was an error, the command failed. 
+ if evs.stats { + if err != nil { + return subcommands.ExitFailure + } + break + } + + time.Sleep(time.Duration(evs.intervalSec) * time.Second) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go new file mode 100644 index 000000000..8379f552d --- /dev/null +++ b/runsc/cmd/exec.go @@ -0,0 +1,375 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "strconv" + "strings" + "syscall" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Exec implements subcommands.Command for the "exec" command. +type Exec struct { + cwd string + env stringSlice + // user contains the UID and GID with which to run the new process. + user user + extraKGIDs stringSlice + caps stringSlice + detach bool + processPath string + pidFile string +} + +// Name implements subcommands.Command.Name. +func (*Exec) Name() string { + return "exec" +} + +// Synopsis implements subcommands.Command.Synopsis. 
+func (*Exec) Synopsis() string { + return "execute new process inside the container" +} + +// Usage implements subcommands.Command.Usage. +func (*Exec) Usage() string { + return `exec [command options] <container-id> <command> [command options] || --process process.json <container-id> + + +Where "<container-id>" is the name for the instance of the container and +"<command>" is the command to be executed in the container. +"<command>" can't be empty unless a "-process" flag provided. + +EXAMPLE: +If the container is configured to run /bin/ps the following will +output a list of processes running in the container: + + # runc exec <container-id> ps + +OPTIONS: +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (ex *Exec) SetFlags(f *flag.FlagSet) { + f.StringVar(&ex.cwd, "cwd", "", "current working directory") + f.Var(&ex.env, "env", "set environment variables (e.g. '-env PATH=/bin -env TERM=xterm')") + f.Var(&ex.user, "user", "UID (format: <uid>[:<gid>])") + f.Var(&ex.extraKGIDs, "additional-gids", "additional gids") + f.Var(&ex.caps, "cap", "add a capability to the bounding set for the process") + f.BoolVar(&ex.detach, "detach", false, "detach from the container's process") + f.StringVar(&ex.processPath, "process", "", "path to the process.json") + f.StringVar(&ex.pidFile, "pid-file", "", "filename that the sandbox pid will be written to") +} + +// Execute implements subcommands.Command.Execute. It starts a process in an +// already created sandbox. 
+func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + e, id, err := ex.parseArgs(f) + if err != nil { + Fatalf("error parsing process spec: %v", err) + } + e.Detach = ex.detach + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandox: %v", err) + } + + if e.WorkingDirectory == "" { + e.WorkingDirectory = s.Spec.Process.Cwd + } + + if e.Envv == nil { + e.Envv, err = resolveEnvs(s.Spec.Process.Env, ex.env) + if err != nil { + Fatalf("error getting environment variables: %v", err) + } + } + + // containerd expects an actual process to represent the container being + // executed. If detach was specified, starts a child in non-detach mode, + // write the child's PID to the pid file. So when the container returns, the + // child process will also return and signal containerd. + if e.Detach { + binPath, err := specutils.BinPath() + if err != nil { + Fatalf("error getting bin path: %v", err) + } + var args []string + for _, a := range os.Args[1:] { + if !strings.Contains(a, "detach") { + args = append(args, a) + } + } + cmd := exec.Command(binPath, args...) + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + Fatalf("failure to start child exec process, err: %v", err) + } + + log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args) + + // Wait for PID file to ensure that child process has started. Otherwise, + // '--process' file is deleted as soon as this process returns and the child + // may fail to read it. 
+ sleepTime := 10 * time.Millisecond + for start := time.Now(); time.Now().Sub(start) < 10*time.Second; { + _, err := os.Stat(ex.pidFile) + if err == nil { + break + } + if pe, ok := err.(*os.PathError); !ok || pe.Err != syscall.ENOENT { + Fatalf("unexpected error waiting for PID file, err: %v", err) + } + + log.Infof("Waiting for PID file to be created...") + time.Sleep(sleepTime) + sleepTime *= sleepTime * 2 + if sleepTime > 1*time.Second { + sleepTime = 1 * time.Second + } + } + *waitStatus = 0 + return subcommands.ExitSuccess + } + + if ex.pidFile != "" { + if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil { + Fatalf("error writing pid file: %v", err) + } + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + // If the user is overriding environment variables, PATH may have been + // overwritten. + rootPath := s.Spec.Root.Path + e.Filename, err = specutils.GetExecutablePath(e.Argv[0], rootPath, e.Envv) + if err != nil { + Fatalf("error getting executable path: %v", err) + } + + ws, err := s.Execute(e) + if err != nil { + Fatalf("error getting processes for sandbox: %v", err) + } + *waitStatus = ws + return subcommands.ExitSuccess +} + +// parseArgs parses exec information from the command line or a JSON file +// depending on whether the --process flag was used. Returns an ExecArgs and +// the ID of the sandbox to be used. +func (ex *Exec) parseArgs(f *flag.FlagSet) (*control.ExecArgs, string, error) { + if ex.processPath == "" { + // Requires at least a container ID and command. + if f.NArg() < 2 { + f.Usage() + return nil, "", fmt.Errorf("both a container-id and command are required") + } + e, err := ex.argsFromCLI(f.Args()[1:]) + return e, f.Arg(0), err + } + // Requires only the container ID. 
+ if f.NArg() != 1 { + f.Usage() + return nil, "", fmt.Errorf("a container-id is required") + } + e, err := ex.argsFromProcessFile() + return e, f.Arg(0), err +} + +func (ex *Exec) argsFromCLI(argv []string) (*control.ExecArgs, error) { + extraKGIDs := make([]auth.KGID, 0, len(ex.extraKGIDs)) + for _, s := range ex.extraKGIDs { + kgid, err := strconv.Atoi(s) + if err != nil { + Fatalf("error parsing GID: %s, %v", s, err) + } + extraKGIDs = append(extraKGIDs, auth.KGID(kgid)) + } + + caps, err := capabilities(ex.caps) + if err != nil { + return nil, fmt.Errorf("capabilities error: %v", err) + } + + return &control.ExecArgs{ + Argv: argv, + WorkingDirectory: ex.cwd, + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: ex.user.kuid, + KGID: ex.user.kgid, + ExtraKGIDs: extraKGIDs, + Capabilities: caps, + }, nil +} + +func (ex *Exec) argsFromProcessFile() (*control.ExecArgs, error) { + f, err := os.Open(ex.processPath) + if err != nil { + return nil, fmt.Errorf("error opening process file: %s, %v", ex.processPath, err) + } + defer f.Close() + var p specs.Process + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, fmt.Errorf("error parsing process file: %s, %v", ex.processPath, err) + } + return argsFromProcess(&p) +} + +// argsFromProcess performs all the non-IO conversion from the Process struct +// to ExecArgs. +func argsFromProcess(p *specs.Process) (*control.ExecArgs, error) { + // Create capabilities. + caps, err := specutils.Capabilities(p.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. 
// resolveEnvs transforms lists of environment variables into a single list of
// environment variables. If a variable is defined multiple times, the last
// value is used. Each variable appears once in the result, at the position
// where its name first occurred in the input, so the output is deterministic.
// An entry that is not of the form NAME=VALUE yields an error.
func resolveEnvs(envs ...[]string) ([]string, error) {
	// values holds the winning value per name; names records first-seen order
	// because map iteration order is random.
	values := make(map[string]string)
	var names []string
	for _, env := range envs {
		for _, str := range env {
			// Split on the first '=' only: the value may itself contain '='.
			parts := strings.SplitN(str, "=", 2)
			if len(parts) != 2 {
				return nil, fmt.Errorf("invalid variable: %s", str)
			}
			name, value := parts[0], parts[1]
			if _, seen := values[name]; !seen {
				names = append(names, name)
			}
			values[name] = value
		}
	}

	// Reassemble into NAME=VALUE form.
	resolved := make([]string, 0, len(names))
	for _, name := range names {
		resolved = append(resolved, name+"="+values[name])
	}
	return resolved, nil
}
// stringSlice is a flag value that accumulates every occurrence of a
// repeatable flag. For example, a flag called "x" invoked as
// "runsc exec -x foo -x bar" leaves the corresponding stringSlice holding
// {"foo", "bar"}.
type stringSlice []string

// String implements flag.Value.String by rendering the collected values in
// the default slice format.
func (ss *stringSlice) String() string {
	return fmt.Sprint(*ss)
}

// Get implements flag.Getter.Get and hands back the receiver itself.
func (ss *stringSlice) Get() interface{} {
	return ss
}

// Set implements flag.Value.Set. Each call appends s to the values collected
// so far; it never fails.
func (ss *stringSlice) Set(s string) error {
	values := append(*ss, s)
	*ss = values
	return nil
}
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +func TestUser(t *testing.T) { + testCases := []struct { + input string + want user + wantErr bool + }{ + {input: "0", want: user{kuid: 0, kgid: 0}}, + {input: "7", want: user{kuid: 7, kgid: 0}}, + {input: "49:343", want: user{kuid: 49, kgid: 343}}, + {input: "0:2401", want: user{kuid: 0, kgid: 2401}}, + {input: "", wantErr: true}, + {input: "foo", wantErr: true}, + {input: ":123", wantErr: true}, + {input: "1:2:3", wantErr: true}, + } + + for _, tc := range testCases { + var u user + if err := u.Set(tc.input); err != nil && tc.wantErr { + // We got an error and wanted one. 
+ continue + } else if err == nil && tc.wantErr { + t.Errorf("user.Set(%s): got no error, but wanted one", tc.input) + } else if err != nil && !tc.wantErr { + t.Errorf("user.Set(%s): got error %v, but wanted none", tc.input, err) + } else if u != tc.want { + t.Errorf("user.Set(%s): got %+v, but wanted %+v", tc.input, u, tc.want) + } + } +} + +func TestCLIArgs(t *testing.T) { + testCases := []struct { + ex Exec + argv []string + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/foo/bar", + user: user{kuid: 0, kgid: 0}, + extraKGIDs: []string{"1", "2", "3"}, + caps: []string{"CAP_DAC_OVERRIDE"}, + processPath: "", + }, + argv: []string{"ls", "/"}, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := tc.ex.argsFromCLI(tc.argv) + if err != nil { + t.Errorf("argsFromCLI(%+v): got error: %+v", tc.ex, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromCLI(%+v): got %+v, but expected %+v", tc.ex, *e, tc.expected) + } + } +} + +func TestJSONArgs(t *testing.T) { + testCases := []struct { + // ex is provided to make sure it is overridden by p. 
+ ex Exec + p specs.Process + expected control.ExecArgs + }{ + { + ex: Exec{ + cwd: "/baz/quux", + user: user{kuid: 1, kgid: 1}, + extraKGIDs: []string{"4", "5", "6"}, + caps: []string{"CAP_SETGID"}, + processPath: "/bin/foo", + }, + p: specs.Process{ + User: specs.User{UID: 0, GID: 0, AdditionalGids: []uint32{1, 2, 3}}, + Args: []string{"ls", "/"}, + Cwd: "/foo/bar", + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{"CAP_DAC_OVERRIDE"}, + Effective: []string{"CAP_DAC_OVERRIDE"}, + Inheritable: []string{"CAP_DAC_OVERRIDE"}, + Permitted: []string{"CAP_DAC_OVERRIDE"}, + }, + }, + expected: control.ExecArgs{ + Argv: []string{"ls", "/"}, + WorkingDirectory: "/foo/bar", + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, os.Stdout, os.Stderr}}, + KUID: 0, + KGID: 0, + ExtraKGIDs: []auth.KGID{1, 2, 3}, + Capabilities: &auth.TaskCapabilities{ + BoundingCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + InheritableCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + PermittedCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + }, + }, + }, + } + + for _, tc := range testCases { + e, err := argsFromProcess(&tc.p) + if err != nil { + t.Errorf("argsFromProcess(%+v): got error: %+v", tc.p, err) + } else if !cmp.Equal(*e, tc.expected, cmpopts.IgnoreUnexported(os.File{})) { + t.Errorf("argsFromProcess(%+v): got %+v, but expected %+v", tc.p, *e, tc.expected) + } + } +} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go new file mode 100644 index 000000000..844e16dbf --- /dev/null +++ b/runsc/cmd/gofer.go @@ -0,0 +1,134 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "sync" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/fsgofer" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Gofer implements subcommands.Command for the "gofer" command, which starts a +// filesystem gofer. This command should not be called directly. +type Gofer struct { + bundleDir string + ioFDs intFlags +} + +// Name implements subcommands.Command. +func (*Gofer) Name() string { + return "gofer" +} + +// Synopsis implements subcommands.Command. +func (*Gofer) Synopsis() string { + return "launch a gofer process that server files over 9P protocol (internal use only)" +} + +// Usage implements subcommands.Command. +func (*Gofer) Usage() string { + return `gofer [flags]` +} + +// SetFlags implements subcommands.Command. +func (g *Gofer) SetFlags(f *flag.FlagSet) { + f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") + f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") +} + +// Execute implements subcommands.Command. 
+func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if g.bundleDir == "" || len(g.ioFDs) < 1 { + f.Usage() + return subcommands.ExitUsageError + } + + spec, err := specutils.ReadSpec(g.bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + specutils.LogSpec(spec) + + // Start with root mount, then add any other addition mount as needed. + ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) + p := absPath(g.bundleDir, spec.Root.Path) + ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ROMount: spec.Root.Readonly, + // Docker uses overlay2 by default for the root mount, and overlay2 does a copy-up when + // each file is opened as writable. Thus, we open files lazily to avoid copy-up. + LazyOpenForWrite: true, + })) + log.Infof("Serving %q mapped to %q on FD %d", "/", p, g.ioFDs[0]) + + mountIdx := 1 // first one is the root + for _, m := range spec.Mounts { + if specutils.Is9PMount(m) { + p = absPath(g.bundleDir, m.Source) + ats = append(ats, fsgofer.NewAttachPoint(p, fsgofer.Config{ + ROMount: isReadonlyMount(m.Options), + LazyOpenForWrite: false, + })) + + if mountIdx >= len(g.ioFDs) { + Fatalf("No FD found for mount. Did you forget --io-fd? mount: %d, %v", len(g.ioFDs), m) + } + log.Infof("Serving %q mapped to %q on FD %d", m.Destination, p, g.ioFDs[mountIdx]) + mountIdx++ + } + } + if mountIdx != len(g.ioFDs) { + Fatalf("Too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs)) + } + + runServers(ats, g.ioFDs) + return subcommands.ExitSuccess +} + +func runServers(ats []p9.Attacher, ioFDs []int) { + // Run the loops and wait for all to exit. 
// isReadonlyMount reports whether the mount options include the exact
// option "ro".
func isReadonlyMount(opts []string) bool {
	readonly := false
	for i := 0; i < len(opts) && !readonly; i++ {
		readonly = opts[i] == "ro"
	}
	return readonly
}
+func (*Kill) Usage() string { + return `kill <container id> [signal]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Kill) SetFlags(f *flag.FlagSet) { + // TODO: Implement this flag. It is defined here just to + // prevent runsc from crashing if it is passed. + var all bool + f.BoolVar(&all, "all", false, "send the specified signal to all processes inside the container") +} + +// Execute implements subcommands.Command.Execute. +func (*Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() == 0 || f.NArg() > 2 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox: %v", err) + } + + // The OCI command-line spec says that the signal should be specified + // via a flag, but runc (and things that call runc) pass it as an + // argument. + signal := f.Arg(2) + if signal == "" { + signal = "TERM" + } + + sig, err := parseSignal(signal) + if err != nil { + Fatalf("%v", err) + } + if err := s.Signal(sig); err != nil { + Fatalf("%v", err) + } + return subcommands.ExitSuccess +} + +func parseSignal(s string) (syscall.Signal, error) { + n, err := strconv.Atoi(s) + if err == nil { + sig := syscall.Signal(n) + for _, msig := range signalMap { + if sig == msig { + return sig, nil + } + } + return -1, fmt.Errorf("unknown signal %q", s) + } + if sig, ok := signalMap[strings.TrimPrefix(strings.ToUpper(s), "SIG")]; ok { + return sig, nil + } + return -1, fmt.Errorf("unknown signal %q", s) +} + +var signalMap = map[string]syscall.Signal{ + "ABRT": unix.SIGABRT, + "ALRM": unix.SIGALRM, + "BUS": unix.SIGBUS, + "CHLD": unix.SIGCHLD, + "CLD": unix.SIGCLD, + "CONT": unix.SIGCONT, + "FPE": unix.SIGFPE, + "HUP": unix.SIGHUP, + "ILL": unix.SIGILL, + "INT": unix.SIGINT, + "IO": unix.SIGIO, + "IOT": unix.SIGIOT, + "KILL": unix.SIGKILL, + "PIPE": unix.SIGPIPE, + "POLL": 
unix.SIGPOLL, + "PROF": unix.SIGPROF, + "PWR": unix.SIGPWR, + "QUIT": unix.SIGQUIT, + "SEGV": unix.SIGSEGV, + "STKFLT": unix.SIGSTKFLT, + "STOP": unix.SIGSTOP, + "SYS": unix.SIGSYS, + "TERM": unix.SIGTERM, + "TRAP": unix.SIGTRAP, + "TSTP": unix.SIGTSTP, + "TTIN": unix.SIGTTIN, + "TTOU": unix.SIGTTOU, + "URG": unix.SIGURG, + "USR1": unix.SIGUSR1, + "USR2": unix.SIGUSR2, + "VTALRM": unix.SIGVTALRM, + "WINCH": unix.SIGWINCH, + "XCPU": unix.SIGXCPU, + "XFSZ": unix.SIGXFSZ, +} diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go new file mode 100644 index 000000000..bf7cb41bb --- /dev/null +++ b/runsc/cmd/list.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "fmt" + "os" + "text/tabwriter" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// List implements subcommands.Command for the "list" command for the "list" command. +type List struct { + quiet bool + format string +} + +// Name implements subcommands.command.name. +func (*List) Name() string { + return "list" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*List) Synopsis() string { + return "list contaners started by runsc with the given root" +} + +// Usage implements subcommands.Command.Usage. 
+func (*List) Usage() string { + return `list [flags]` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (l *List) SetFlags(f *flag.FlagSet) { + f.BoolVar(&l.quiet, "quiet", false, "only list container ids") + f.StringVar(&l.format, "format", "text", "output format: 'text' (default) or 'json'") +} + +// Execute implements subcommands.Command.Execute. +func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 0 { + f.Usage() + return subcommands.ExitUsageError + } + + conf := args[0].(*boot.Config) + ids, err := sandbox.List(conf.RootDir) + if err != nil { + Fatalf("%v", err) + } + + if l.quiet { + for _, id := range ids { + fmt.Println(id) + } + return subcommands.ExitSuccess + } + + // Collect the sandboxes. + var sandboxes []*sandbox.Sandbox + for _, id := range ids { + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox %q: %v", id, err) + } + sandboxes = append(sandboxes, s) + } + + switch l.format { + case "text": + // Print a nice table. + w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") + for _, s := range sandboxes { + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", + s.ID, + s.Pid, + s.Status, + s.BundleDir, + s.CreatedAt.Format(time.RFC3339Nano), + s.Owner) + } + w.Flush() + case "json": + // Print just the states. + var states []specs.State + for _, s := range sandboxes { + states = append(states, s.State()) + } + if err := json.NewEncoder(os.Stdout).Encode(states); err != nil { + Fatalf("error marshaling sandbox state: %v", err) + } + default: + Fatalf("unknown list format %q", l.format) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/path.go b/runsc/cmd/path.go new file mode 100644 index 000000000..4bb1dbb4f --- /dev/null +++ b/runsc/cmd/path.go @@ -0,0 +1,38 @@ +// Copyright 2018 Google Inc. 
// absPath returns rel unchanged when it is already an absolute path;
// otherwise it returns rel joined onto base.
func absPath(base, rel string) string {
	if !filepath.IsAbs(rel) {
		rel = filepath.Join(base, rel)
	}
	return rel
}
+ +package cmd + +import ( + "context" + "fmt" + + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// PS implements subcommands.Command for the "ps" command. +type PS struct { + format string +} + +// Name implements subcommands.Command.Name. +func (*PS) Name() string { + return "ps" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*PS) Synopsis() string { + return "ps displays the processes running inside a container" +} + +// Usage implements subcommands.Command.Usage. +func (*PS) Usage() string { + return "<container-id> [ps options]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (ps *PS) SetFlags(f *flag.FlagSet) { + f.StringVar(&ps.format, "format", "table", "output format. Select one of: table or json (default: table)") +} + +// Execute implements subcommands.Command.Execute. +func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandox: %v", err) + } + pList, err := s.Processes() + if err != nil { + Fatalf("error getting processes for sandbox: %v", err) + } + + switch ps.format { + case "table": + fmt.Println(control.ProcessListToTable(pList)) + case "json": + o, err := control.PrintPIDsJSON(pList) + if err != nil { + Fatalf("error generating JSON: %v", err) + } + fmt.Println(o) + default: + Fatalf("Unsupported format: %s", ps.format) + } + + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go new file mode 100644 index 000000000..a61a6c73e --- /dev/null +++ b/runsc/cmd/run.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "syscall" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// Run implements subcommands.Command for the "run" command. +type Run struct { + // Run flags are a super-set of those for Create. + Create +} + +// Name implements subcommands.Command.Name. +func (*Run) Name() string { + return "run" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Run) Synopsis() string { + return "create and run a secure container" +} + +// Usage implements subcommands.Command.Usage. +func (*Run) Usage() string { + return `run [flags] <container id> - create and run a secure container. +` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (r *Run) SetFlags(f *flag.FlagSet) { + r.Create.SetFlags(f) +} + +// Execute implements subcommands.Command.Execute. 
+func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + waitStatus := args[1].(*syscall.WaitStatus) + + bundleDir := r.bundleDir + if bundleDir == "" { + bundleDir = getwdOrDie() + } + spec, err := specutils.ReadSpec(bundleDir) + if err != nil { + Fatalf("error reading spec: %v", err) + } + + ws, err := sandbox.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, commandLineFlags()) + if err != nil { + Fatalf("error running sandbox: %v", err) + } + + *waitStatus = ws + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go new file mode 100644 index 000000000..a8e132497 --- /dev/null +++ b/runsc/cmd/start.go @@ -0,0 +1,64 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// Start implements subcommands.Command for the "start" command. +type Start struct{} + +// Name implements subcommands.Command.Name. +func (*Start) Name() string { + return "start" +} + +// Synopsis implements subcommands.Command.Synopsis. +func (*Start) Synopsis() string { + return "start a secure container" +} + +// Usage implements subcommands.Command.Usage. 
+func (*Start) Usage() string { + return `start <container id> - start a secure container.` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*Start) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + Fatalf("error starting sandbox: %v", err) + } + return subcommands.ExitSuccess +} diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go new file mode 100644 index 000000000..0b47f290a --- /dev/null +++ b/runsc/cmd/state.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/json" + "os" + + "context" + "flag" + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// State implements subcommands.Command for the "state" command. +type State struct{} + +// Name implements subcommands.Command.Name. +func (*State) Name() string { + return "state" +} + +// Synopsis implements subcommands.Command.Synopsis. 
+func (*State) Synopsis() string { + return "get the state of a sandbox" +} + +// Usage implements subcommands.Command.Usage. +func (*State) Usage() string { + return `state [flags] <container id> - get the state of a sandbox` +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*State) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command.Execute. +func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if f.NArg() != 1 { + f.Usage() + return subcommands.ExitUsageError + } + + id := f.Arg(0) + conf := args[0].(*boot.Config) + + s, err := sandbox.Load(conf.RootDir, id) + if err != nil { + Fatalf("error loading sandbox: %v", err) + } + log.Debugf("Returning state %+v", s) + + // Write json-encoded state directly to stdout. + b, err := json.MarshalIndent(s.State(), "", " ") + if err != nil { + Fatalf("error marshaling sandbox state: %v", err) + } + os.Stdout.Write(b) + return subcommands.ExitSuccess +} diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD new file mode 100644 index 000000000..24e172f48 --- /dev/null +++ b/runsc/fsgofer/BUILD @@ -0,0 +1,33 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "fsgofer", + srcs = [ + "fsgofer.go", + "fsgofer_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/fsgofer", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/fd", + "//pkg/log", + "//pkg/p9", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "fsgofer_test", + size = "small", + srcs = ["fsgofer_test.go"], + embed = [":fsgofer"], + deps = [ + "//pkg/log", + "//pkg/p9", + ], +) diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go new file mode 100644 index 000000000..5ddc75a9d --- /dev/null +++ b/runsc/fsgofer/fsgofer.go @@ -0,0 +1,937 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsgofer implements p9.File giving access to local files using +// a simple mapping from a path prefix that is added to the path requested +// by the sandbox. Ex: +// +// prefix: "/docker/imgs/alpine" +// app path: /bin/ls => /docker/imgs/alpine/bin/ls +package fsgofer + +import ( + "fmt" + "io" + "math" + "os" + "path" + "path/filepath" + "strings" + "sync" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" +) + +const ( + // invalidMode is set to a value that doesn't match any other valid + // modes to ensure an unopened/closed file fails all mode checks. + invalidMode = p9.OpenFlags(math.MaxUint32) + + openFlags = syscall.O_NOFOLLOW | syscall.O_CLOEXEC +) + +type fileType int + +const ( + regular fileType = iota + directory + symlink +) + +// String implements fmt.Stringer. +func (f fileType) String() string { + switch f { + case regular: + return "regular" + case directory: + return "directory" + case symlink: + return "symlink" + } + return "unknown" +} + +// Config sets configuration options for each attach point. +type Config struct { + // ROMount is set to true if this is a readonly mount. 
+ ROMount bool + + // LazyOpenForWrite makes the underlying file to be opened in RDONLY + // mode initially and be reopened in case write access is desired. + // This is done to workaround the behavior in 'overlay2' that + // copies the entire file up eagerly when it's opened in write mode + // even if the file is never actually written to. + LazyOpenForWrite bool +} + +type attachPoint struct { + prefix string + conf Config +} + +// NewAttachPoint creates a new attacher that gives local file +// access to all files under 'prefix'. +func NewAttachPoint(prefix string, c Config) p9.Attacher { + return &attachPoint{prefix: prefix, conf: c} +} + +// Attach implements p9.Attacher. +func (a *attachPoint) Attach(appPath string) (p9.File, error) { + if !path.IsAbs(appPath) { + return nil, fmt.Errorf("invalid path %q", appPath) + } + + root := filepath.Join(a.prefix, appPath) + f, err := os.OpenFile(root, openFlags|syscall.O_RDONLY, 0) + if err != nil { + return nil, fmt.Errorf("unable to open file %q, err: %v", root, err) + } + stat, err := stat(int(f.Fd())) + if err != nil { + return nil, fmt.Errorf("failed to stat file %q, err: %v", root, err) + } + return newLocalFile(a.conf, f, root, stat) +} + +func makeQID(stat syscall.Stat_t) p9.QID { + return p9.QID{ + Type: p9.FileMode(stat.Mode).QIDType(), + Path: stat.Ino, + } +} + +func isNameValid(name string) bool { + if name == "" || name == "." || name == ".." { + log.Warningf("Invalid name: %s", name) + return false + } + if strings.IndexByte(name, '/') >= 0 { + log.Warningf("Invalid name: %s", name) + return false + } + return true +} + +// localFile implements p9.File wrapping a local file. The underlying file +// is opened during Walk() and stored in 'controlFile' to be used with other +// operations. The mode in which the file is opened varies depending on the +// configuration (see below). 'controlFile' is dup'ed when Walk(nil) is called +// to clone the file. +// +// 'openedFile' is assigned when Open() is called. 
If requested open mode is +// a subset of controlFile's mode, it's possible to use the same file. If mode +// is not a subset, then another file is opened. Consequently, 'openedFile' +// could have a mode wider than requested and must be verified before read/write +// operations. Before the file is opened and after it's closed, 'mode' is set to +// an invalid value to prevent an unopened file from being used. +// +// localFile has 2 modes of operation based on the configuration: +// +// ** conf.lazyRWOpen == false ** +// This is the preferred mode. 'controlFile' is opened in RW mode in Walk() +// and used across all functions. The file is never reopened as the mode will +// always be a super set of the requested open mode. This reduces the number of +// syscalls required per operation and makes it resilient to renames anywhere +// in the path to the file. +// +// ** conf.lazyRWOpen == true ** +// This mode is used for better performance with 'overlay2' storage driver. +// overlay2 eagerly copies the entire file up when it's opened in write mode +// which makes the mode above perform badly when serveral of files are opened +// for read (esp. startup). In this mode, 'controlFile' is opened as readonly +// (or O_PATH for symlinks). Reopening the file is required if write mode +// is requested in Open(). +type localFile struct { + p9.DefaultWalkGetAttr + + // mu protects 'hostPath' when file is renamed. + mu sync.Mutex + + // TODO: hostPath is not safe to use as path needs to be walked + // everytime (and can change underneath us). Remove all usages. + hostPath string + + // controlFile is opened when localFile is created and it's never nil. + controlFile *os.File + + // openedFile is nil until localFile is opened. It may point to controlFile + // or be a new file struct. See struct comment for more details. + openedFile *os.File + + // mode is the mode in which the file was opened. Set to invalidMode + // if localFile isn't opened. 
+ mode p9.OpenFlags + + ft fileType + + conf Config + + // readDirMu protects against concurrent Readdir calls. + readDirMu sync.Mutex +} + +func openAnyFile(parent *localFile, name string) (*os.File, string, error) { + // Attempt to open file in the following mode in order: + // 1. RDWR: for files with rw mounts and LazyOpenForWrite disabled + // 2. RDONLY: for directories, ro mounts or LazyOpenForWrite enabled + // 3. PATH: for symlinks + modes := []int{syscall.O_RDWR, syscall.O_RDONLY, unix.O_PATH} + symlinkIdx := len(modes) - 1 + + startIdx := 0 + if parent.conf.ROMount || parent.conf.LazyOpenForWrite { + // Skip attempt to open in RDWR based on configuration. + startIdx = 1 + } + + var err error + var fd int + for i := startIdx; i < len(modes); i++ { + fd, err = syscall.Openat(parent.controlFD(), name, openFlags|modes[i], 0) + if err == nil { + // openat succeeded, we're done. + break + } + switch e := extractErrno(err); e { + case syscall.ENOENT: + // File doesn't exist, no point in retrying. + return nil, "", e + case syscall.ELOOP: + if i < symlinkIdx { + // File was opened with O_NOFOLLOW, so this error can only happen when + // trying ot open a symlink. Jump straight to flags compatible with symlink. + i = symlinkIdx - 1 + } + } + // openat failed. Try again with next mode, preserving 'err' in + // case this was the last attempt. + log.Debugf("Attempt %d to open file failed, mode: %#x, path: %s/%s, err: %v", i, openFlags|modes[i], parent.controlFile.Name(), name, err) + } + if err != nil { + // All attempts to open file have failed, return the last error. 
+ log.Debugf("Failed to open file, path: %s/%s, err: %v", parent.controlFile.Name(), name, err) + return nil, "", extractErrno(err) + } + + parent.mu.Lock() + defer parent.mu.Unlock() + newPath := path.Join(parent.hostPath, name) + + return os.NewFile(uintptr(fd), newPath), newPath, nil +} + +func newLocalFile(conf Config, file *os.File, path string, stat syscall.Stat_t) (*localFile, error) { + var ft fileType + switch stat.Mode & syscall.S_IFMT { + case syscall.S_IFREG: + ft = regular + case syscall.S_IFDIR: + ft = directory + case syscall.S_IFLNK: + ft = symlink + default: + return nil, syscall.EINVAL + } + return &localFile{ + hostPath: path, + controlFile: file, + conf: conf, + mode: invalidMode, + ft: ft, + }, nil +} + +// newFDMaybe creates a fd.FD from a file, dup'ing the FD and setting it as +// non-blocking. If anything fails, returns nil. It's better to have a file +// without host FD, than to fail the operation. +func newFDMaybe(file *os.File) *fd.FD { + fd, err := fd.NewFromFile(file) + if err != nil { + return nil + } + + // fd is blocking; non-blocking is required. + if err := syscall.SetNonblock(fd.FD(), true); err != nil { + fd.Close() + return nil + } + return fd +} + +func stat(fd int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Fstat(fd, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func fchown(fd int, uid p9.UID, gid p9.GID) error { + return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) +} + +func (l *localFile) controlFD() int { + return int(l.controlFile.Fd()) +} + +func (l *localFile) openedFD() int { + if l.openedFile == nil { + panic(fmt.Sprintf("trying to use an unopened file: %q", l.controlFile.Name())) + } + return int(l.openedFile.Fd()) +} + +// Open implements p9.File. 
+func (l *localFile) Open(mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + if l.openedFile != nil { + panic(fmt.Sprintf("attempting to open already opened file: %q", l.controlFile.Name())) + } + + // Check if control file can be used or if a new open must be created. + var newFile *os.File + if mode == p9.ReadOnly || !l.conf.LazyOpenForWrite { + log.Debugf("Open reusing control file, mode: %v, %q", mode, l.controlFile.Name()) + newFile = l.controlFile + } else { + // Ideally reopen would call name_to_handle_at (with empty name) and open_by_handle_at + // to reopen the file without using 'hostPath'. However, name_to_handle_at and + // open_by_handle_at aren't supported by overlay2. + log.Debugf("Open reopening file, mode: %v, %q", mode, l.controlFile.Name()) + var err error + + l.mu.Lock() + newFile, err = os.OpenFile(l.hostPath, openFlags|mode.OSFlags(), 0) + if err != nil { + l.mu.Unlock() + return nil, p9.QID{}, 0, extractErrno(err) + } + l.mu.Unlock() + } + + stat, err := stat(int(newFile.Fd())) + if err != nil { + newFile.Close() + return nil, p9.QID{}, 0, extractErrno(err) + } + + var fd *fd.FD + if stat.Mode&syscall.S_IFMT == syscall.S_IFREG { + // Donate FD for regular files only. + fd = newFDMaybe(newFile) + } + + // Set fields on success + l.openedFile = newFile + l.mode = mode + return fd, makeQID(stat), 0, nil +} + +// Create implements p9.File. +func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9.File, p9.QID, uint32, error) { + if l.conf.ROMount { + return nil, nil, p9.QID{}, 0, syscall.EBADF + } + if !isNameValid(name) { + return nil, nil, p9.QID{}, 0, syscall.EINVAL + } + + // Use a single file for both 'controlFile' and 'openedFile'. Mode must include read for control + // and whichever else was requested by caller. Note that resulting file might have a wider mode + // than needed for each particular case. 
+ flags := openFlags | syscall.O_CREAT | syscall.O_EXCL + if mode == p9.WriteOnly { + flags |= syscall.O_RDWR + } else { + flags |= mode.OSFlags() + } + + fd, err := syscall.Openat(l.controlFD(), name, flags, uint32(perm.Permissions())) + if err != nil { + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + if err := fchown(fd, uid, gid); err != nil { + syscall.Close(fd) + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + syscall.Close(fd) + return nil, nil, p9.QID{}, 0, extractErrno(err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + cPath := path.Join(l.hostPath, name) + f := os.NewFile(uintptr(fd), cPath) + c := &localFile{ + hostPath: cPath, + controlFile: f, + openedFile: f, + mode: mode, + conf: l.conf, + } + return newFDMaybe(c.openedFile), c, makeQID(stat), 0, nil +} + +// Mkdir implements p9.File. +func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + if l.conf.ROMount { + return p9.QID{}, syscall.EBADF + } + + if !isNameValid(name) { + return p9.QID{}, syscall.EINVAL + } + + if err := syscall.Mkdirat(l.controlFD(), name, uint32(perm.Permissions())); err != nil { + return p9.QID{}, extractErrno(err) + } + + // Open directory to change ownership and stat it. + flags := syscall.O_DIRECTORY | syscall.O_RDONLY | openFlags + fd, err := syscall.Openat(l.controlFD(), name, flags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer syscall.Close(fd) + + if err := fchown(fd, uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return makeQID(stat), nil +} + +// Walk implements p9.File. +func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { + // Duplicate current file if 'names' is empty. 
+ if len(names) == 0 { + newFd, err := syscall.Dup(l.controlFD()) + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := stat(newFd) + if err != nil { + syscall.Close(newFd) + return nil, nil, extractErrno(err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + c := &localFile{ + hostPath: l.hostPath, + controlFile: os.NewFile(uintptr(newFd), l.hostPath), + mode: invalidMode, + conf: l.conf, + } + return []p9.QID{makeQID(stat)}, c, nil + } + + var qids []p9.QID + last := l + for _, name := range names { + if !isNameValid(name) { + return nil, nil, syscall.EINVAL + } + + f, path, err := openAnyFile(last, name) + if err != nil { + return nil, nil, extractErrno(err) + } + stat, err := stat(int(f.Fd())) + if err != nil { + return nil, nil, extractErrno(err) + } + c, err := newLocalFile(last.conf, f, path, stat) + if err != nil { + return nil, nil, extractErrno(err) + } + + qids = append(qids, makeQID(stat)) + last = c + } + return qids, last, nil +} + +// StatFS implements p9.File. +func (l *localFile) StatFS() (p9.FSStat, error) { + var s syscall.Statfs_t + if err := syscall.Fstatfs(l.controlFD(), &s); err != nil { + return p9.FSStat{}, extractErrno(err) + } + + // Populate with what's available. + return p9.FSStat{ + Type: uint32(s.Type), + BlockSize: uint32(s.Bsize), + Blocks: s.Blocks, + BlocksFree: s.Bfree, + BlocksAvailable: s.Bavail, + Files: s.Files, + FilesFree: s.Ffree, + NameLength: uint32(s.Namelen), + }, nil +} + +// FSync implements p9.File. +func (l *localFile) FSync() error { + if l.openedFile == nil { + return syscall.EBADF + } + if err := l.openedFile.Sync(); err != nil { + return extractErrno(err) + } + return nil +} + +// GetAttr implements p9.File. 
+func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + stat, err := stat(l.controlFD()) + if err != nil { + return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) + } + + attr := p9.Attr{ + Mode: p9.FileMode(stat.Mode), + UID: p9.UID(stat.Uid), + GID: p9.GID(stat.Gid), + NLink: stat.Nlink, + RDev: stat.Rdev, + Size: uint64(stat.Size), + BlockSize: uint64(stat.Blksize), + Blocks: uint64(stat.Blocks), + ATimeSeconds: uint64(stat.Atim.Sec), + ATimeNanoSeconds: uint64(stat.Atim.Nsec), + MTimeSeconds: uint64(stat.Mtim.Sec), + MTimeNanoSeconds: uint64(stat.Mtim.Nsec), + CTimeSeconds: uint64(stat.Ctim.Sec), + CTimeNanoSeconds: uint64(stat.Ctim.Nsec), + } + valid := p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + NLink: true, + RDev: true, + Size: true, + Blocks: true, + ATime: true, + MTime: true, + CTime: true, + } + + return makeQID(stat), valid, attr, nil +} + +// SetAttr implements p9.File. Due to mismatch in file API, options +// cannot be changed atomicaly and user may see partial changes when +// an error happens. +func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error { + if l.conf.ROMount { + return syscall.EBADF + } + + allowed := p9.SetAttrMask{ + Permissions: true, + UID: true, + GID: true, + Size: true, + ATime: true, + MTime: true, + ATimeNotSystemTime: true, + MTimeNotSystemTime: true, + } + + if valid.Empty() { + // Nothing to do. + return nil + } + + // Handle all the sanity checks up front so that the client gets a + // consistent result that is not attribute dependent. + if !valid.IsSubsetOf(allowed) { + log.Warningf("SetAttr() failed for %q, mask: %v", l.controlFile.Name(), valid) + return syscall.EPERM + } + + fd := l.controlFD() + if l.conf.LazyOpenForWrite && l.ft == regular { + // Regular files are opened in RO mode when lazy open is set. + // Thus it needs to be reopened here for write. 
+ f, err := os.OpenFile(l.hostPath, openFlags|os.O_WRONLY, 0) + if err != nil { + return extractErrno(err) + } + defer f.Close() + fd = int(f.Fd()) + } + + // The semantics are to either return an error if no changes were made, + // or no error if *all* changes were made. Well, this can be impossible + // if the filesystem rejects at least one of the changes, especially + // since some operations are not easy to undo atomically. + // + // This could be made better if SetAttr actually returned the changes + // it did make, so the client can at least know what has changed. So + // we at least attempt to make all of the changes and return a generic + // error if any of them fails, which at least doesn't bias any change + // over another. + var err error + if valid.Permissions { + if cerr := syscall.Fchmod(fd, uint32(attr.Permissions)); cerr != nil { + log.Debugf("SetAttr fchmod failed %q, err: %v", l.hostPath, cerr) + err = extractErrno(cerr) + } + } + + if valid.Size { + if terr := syscall.Ftruncate(fd, int64(attr.Size)); terr != nil { + log.Debugf("SetAttr ftruncate failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + + if valid.ATime || valid.MTime { + utimes := [2]syscall.Timespec{ + syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + syscall.Timespec{Sec: 0, Nsec: linux.UTIME_OMIT}, + } + if valid.ATime { + if valid.ATimeNotSystemTime { + utimes[0].Sec = int64(attr.ATimeSeconds) + utimes[0].Nsec = int64(attr.ATimeNanoSeconds) + } else { + utimes[0].Nsec = linux.UTIME_NOW + } + } + if valid.MTime { + if valid.MTimeNotSystemTime { + utimes[1].Sec = int64(attr.MTimeSeconds) + utimes[1].Nsec = int64(attr.MTimeNanoSeconds) + } else { + utimes[1].Nsec = linux.UTIME_NOW + } + } + + if l.ft == symlink { + // utimensat operates different that other syscalls. To operate on a + // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty + // name. 
+ f, err := os.OpenFile(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0) + if err != nil { + return extractErrno(err) + } + defer f.Close() + + if terr := utimensat(int(f.Fd()), path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } else { + // Directories and regular files can operate directly on the fd + // using empty name. + if terr := utimensat(fd, "", utimes, 0); terr != nil { + log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr) + err = extractErrno(terr) + } + } + } + + if valid.UID || valid.GID { + uid := -1 + if valid.UID { + uid = int(attr.UID) + } + gid := -1 + if valid.GID { + gid = int(attr.GID) + } + if oerr := syscall.Fchownat(fd, "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil { + log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr) + err = extractErrno(oerr) + } + } + + return err +} + +// Remove implements p9.File. +// +// This is deprecated in favor of UnlinkAt. +func (*localFile) Remove() error { + return syscall.ENOSYS +} + +// Rename implements p9.File. +func (l *localFile) Rename(directory p9.File, name string) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(name) { + return syscall.EINVAL + } + + l.mu.Lock() + defer l.mu.Unlock() + + // TODO: change to renameat(2) + parent := directory.(*localFile) + newPath := path.Join(parent.hostPath, name) + if err := os.Rename(l.hostPath, newPath); err != nil { + return extractErrno(err) + } + + // Update path on success. + // TODO: this doesn't cover cases where any of the + // parents have been renamed. + l.hostPath = newPath + return nil +} + +// RenameAt implements p9.File.RenameAt. +// +// Code still uses [deprecated] Rename(). +func (*localFile) RenameAt(_ string, _ p9.File, _ string) error { + return syscall.ENOSYS +} + +// ReadAt implements p9.File. 
+func (l *localFile) ReadAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if l.openedFile == nil { + return 0, syscall.EBADF + } + + r, err := l.openedFile.ReadAt(p, int64(offset)) + switch err { + case nil, io.EOF: + return r, nil + default: + return r, extractErrno(err) + } +} + +// WriteAt implements p9.File. +func (l *localFile) WriteAt(p []byte, offset uint64) (int, error) { + if l.mode != p9.WriteOnly && l.mode != p9.ReadWrite { + return 0, syscall.EBADF + } + if l.openedFile == nil { + return 0, syscall.EBADF + } + + w, err := l.openedFile.WriteAt(p, int64(offset)) + if err != nil { + return w, extractErrno(err) + } + return w, nil +} + +// Symlink implements p9.File. +func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + if l.conf.ROMount { + return p9.QID{}, syscall.EBADF + } + if !isNameValid(newName) { + return p9.QID{}, syscall.EINVAL + } + + if err := unix.Symlinkat(target, l.controlFD(), newName); err != nil { + return p9.QID{}, extractErrno(err) + } + + // Open symlink to change ownership and stat it. + fd, err := syscall.Openat(l.controlFD(), newName, unix.O_PATH|openFlags, 0) + if err != nil { + return p9.QID{}, extractErrno(err) + } + defer syscall.Close(fd) + + if err := fchown(fd, uid, gid); err != nil { + return p9.QID{}, extractErrno(err) + } + stat, err := stat(fd) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return makeQID(stat), nil +} + +// Link implements p9.File. +func (l *localFile) Link(target p9.File, newName string) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(newName) { + return syscall.EINVAL + } + + targetFile := target.(*localFile) + if err := unix.Linkat(targetFile.controlFD(), "", l.controlFD(), newName, linux.AT_EMPTY_PATH); err != nil { + return extractErrno(err) + } + return nil +} + +// Mknod implements p9.File. +// +// Not implemented. 
+func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { + return p9.QID{}, syscall.ENOSYS +} + +// UnlinkAt implements p9.File. +func (l *localFile) UnlinkAt(name string, flags uint32) error { + if l.conf.ROMount { + return syscall.EBADF + } + if !isNameValid(name) { + return syscall.EINVAL + } + if err := unix.Unlinkat(l.controlFD(), name, int(flags)); err != nil { + return extractErrno(err) + } + return nil +} + +// Readdir implements p9.File. +func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) { + if l.mode != p9.ReadOnly && l.mode != p9.ReadWrite { + return nil, syscall.EBADF + } + if l.openedFile == nil { + return nil, syscall.EBADF + } + + // Readdirnames is a cursor over directories, so seek back to 0 to ensure it's + // reading all directory contents. Take a lock because this operation is stateful. + l.readDirMu.Lock() + if _, err := l.openedFile.Seek(0, 0); err != nil { + l.readDirMu.Unlock() + return nil, extractErrno(err) + } + names, err := l.openedFile.Readdirnames(-1) + if err != nil { + l.readDirMu.Unlock() + return nil, extractErrno(err) + } + l.readDirMu.Unlock() + + var dirents []p9.Dirent + for i := int(offset); i >= 0 && i < len(names); i++ { + stat, err := statAt(l.openedFD(), names[i]) + if err != nil { + continue + } + qid := makeQID(stat) + dirents = append(dirents, p9.Dirent{ + QID: qid, + Type: qid.Type, + Name: names[i], + Offset: uint64(i + 1), + }) + } + return dirents, nil +} + +// Readlink implements p9.File. +func (l *localFile) Readlink() (string, error) { + // Shamelessly stolen from os.Readlink (added upper bound limit to buffer). + for len := 128; len < 1024*1024; len *= 2 { + b := make([]byte, len) + n, err := unix.Readlinkat(l.controlFD(), "", b) + if err != nil { + return "", extractErrno(err) + } + if n < len { + return string(b[:n]), nil + } + } + return "", syscall.ENOMEM +} + +// Flush implements p9.File. 
+func (l *localFile) Flush() error { + return nil +} + +// Connect implements p9.File. +func (l *localFile) Connect(p9.ConnectFlags) (*fd.FD, error) { + return nil, syscall.ECONNREFUSED +} + +// Close implements p9.File. +func (l *localFile) Close() error { + err := l.controlFile.Close() + + // Close only once in case opened and control files point to + // the same os.File struct. + if l.openedFile != nil && l.openedFile != l.controlFile { + err = l.openedFile.Close() + } + + l.openedFile = nil + l.controlFile = nil + l.mode = invalidMode + return err +} + +// extractErrno tries to determine the errno. +func extractErrno(err error) syscall.Errno { + if err == nil { + // This should never happen. The likely result will be that + // some user gets the frustration "error: SUCCESS" message. + log.Warningf("extractErrno called with nil error!") + return 0 + } + + switch err { + case os.ErrNotExist: + return syscall.ENOENT + case os.ErrExist: + return syscall.EEXIST + case os.ErrPermission: + return syscall.EACCES + case os.ErrInvalid: + return syscall.EINVAL + } + + // See if it's an errno or a common wrapped error. + switch e := err.(type) { + case syscall.Errno: + return e + case *os.PathError: + return extractErrno(e.Err) + case *os.LinkError: + return extractErrno(e.Err) + case *os.SyscallError: + return extractErrno(e.Err) + } + + // Fall back to EIO. + log.Debugf("Unknown error: %v, defaulting to EIO", err) + return syscall.EIO +} diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go new file mode 100644 index 000000000..7d834d596 --- /dev/null +++ b/runsc/fsgofer/fsgofer_test.go @@ -0,0 +1,576 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "fmt" + "io/ioutil" + "os" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" +) + +func init() { + log.SetLevel(log.Debug) + + allConfs = append(allConfs, rwConfs...) + allConfs = append(allConfs, roConfs...) +} + +var ( + allTypes = []fileType{regular, directory, symlink} + + // allConfs is set in init() above. + allConfs []Config + + rwConfs = []Config{ + Config{ROMount: false, LazyOpenForWrite: false}, + Config{ROMount: false, LazyOpenForWrite: true}, + } + roConfs = []Config{ + Config{ROMount: true, LazyOpenForWrite: false}, + Config{ROMount: true, LazyOpenForWrite: true}, + } +) + +type state struct { + root *localFile + file *localFile + conf Config + ft fileType +} + +func (s state) String() string { + return fmt.Sprintf("lazyopen(%v)-%v", s.conf.LazyOpenForWrite, s.ft) +} + +func runAll(t *testing.T, test func(*testing.T, state)) { + runCustom(t, allTypes, allConfs, test) +} + +func runCustom(t *testing.T, types []fileType, confs []Config, test func(*testing.T, state)) { + for _, c := range confs { + t.Logf("Config: %+v", c) + + for _, ft := range types { + t.Logf("File type: %v", ft) + + path, name, err := setup(ft) + if err != nil { + t.Fatalf("%v", err) + } + defer os.RemoveAll(path) + + a := NewAttachPoint(path, c) + root, err := a.Attach("/") + if err != nil { + t.Fatalf("Attach(%q) failed, err: %v", "/", err) + } + + _, file, err := root.Walk([]string{name}) + if err != nil { + root.Close() + t.Fatalf("root.Walk({%q}) failed, err: %v", 
"symlink", err) + } + + st := state{root: root.(*localFile), file: file.(*localFile), conf: c, ft: ft} + test(t, st) + file.Close() + root.Close() + } + } +} + +func setup(ft fileType) (string, string, error) { + path, err := ioutil.TempDir("", "root-") + if err != nil { + return "", "", fmt.Errorf("ioutil.TempDir() failed, err: %v", err) + } + + // First attach with writable configuiration to setup tree. + a := NewAttachPoint(path, Config{}) + root, err := a.Attach("/") + if err != nil { + return "", "", fmt.Errorf("Attach(%q) failed, err: %v", "/", err) + } + defer root.Close() + + var name string + switch ft { + case regular: + name = "file" + _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err) + } + defer f.Close() + case directory: + name = "dir" + if _, err := root.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.MkDir(%q) failed, err: %v", name, err) + } + case symlink: + name = "symlink" + if _, err := root.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + return "", "", fmt.Errorf("root.Symlink(%q) failed, err: %v", name, err) + } + default: + panic(fmt.Sprintf("unknown file type %v", ft)) + } + return path, name, nil +} + +func createFile(dir *localFile, name string) (*localFile, error) { + _, f, _, _, err := dir.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + return nil, err + } + return f.(*localFile), nil +} + +func TestReadWrite(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + child, err := createFile(s.file, "test") + if err != nil { + t.Fatalf("%v: createFile() failed, err: %v", s, err) + } + defer child.Close() + b := []byte("foobar") + w, err := child.WriteAt(b, 0) + if err != nil { + t.Fatalf("%v: Write() failed, 
err: %v", s, err) + } + if w != len(b) { + t.Fatalf("%v: Write() was partial, got: %d, expected: %d", s, w, len(b)) + } + for _, test := range []struct { + flags p9.OpenFlags + read bool + write bool + }{ + {flags: p9.ReadOnly, read: true, write: false}, + {flags: p9.WriteOnly, read: false, write: true}, + {flags: p9.ReadWrite, read: true, write: true}, + } { + _, l, err := s.file.Walk([]string{"test"}) + if err != nil { + t.Fatalf("%v: Walk(%s) failed, err: %v", s, "test", err) + } + if _, _, _, err := l.Open(test.flags); err != nil { + t.Fatalf("%v: Open(%v) failed, err: %v", s, test.flags, err) + } + + w, err = l.WriteAt(b, 0) + if test.write { + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + } + if w != len(b) { + t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) + } + } else { + if err == nil { + t.Fatalf("%v, %v: WriteAt() should have failed", s, test.flags) + } + } + + rBuf := make([]byte, len(b)) + r, err := l.ReadAt(rBuf, 0) + if test.read { + if err != nil { + t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) + } + if r != len(rBuf) { + t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") + } + } else { + if err == nil { + t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) + } + } + } + }) +} + +func TestCreate(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + for i, test := range []struct { + flags p9.OpenFlags + read bool + }{ + {flags: p9.WriteOnly, read: false}, + {flags: p9.ReadWrite, read: true}, + } { + _, l, _, _, err := s.file.Create(fmt.Sprintf("test-%d", i), test.flags, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + } + + b := 
[]byte("foobar") + w, err := l.WriteAt(b, 0) + if err != nil { + t.Fatalf("%v, %v: WriteAt() failed, err: %v", s, test.flags, err) + } + if w != len(b) { + t.Fatalf("%v, %v: WriteAt() was partial, got: %d, expected: %d", s, test.flags, w, len(b)) + } + + rBuf := make([]byte, len(b)) + r, err := l.ReadAt(rBuf, 0) + if test.read { + if err != nil { + t.Fatalf("%v, %v: ReadAt() failed, err: %v", s, test.flags, err) + } + if r != len(rBuf) { + t.Fatalf("%v, %v: ReadAt() was partial, got: %d, expected: %d", s, test.flags, r, len(rBuf)) + } + if string(rBuf) != "foobar" { + t.Fatalf("%v, %v: ReadAt() wrong data, got: %s, expected: %s", s, test.flags, string(rBuf), "foobar") + } + } else { + if err == nil { + t.Fatalf("%v, %v: ReadAt() should have failed", s, test.flags) + } + } + } + }) +} + +func TestUnopened(t *testing.T) { + runCustom(t, []fileType{regular}, allConfs, func(t *testing.T, s state) { + b := []byte("foobar") + if _, err := s.file.WriteAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: WriteAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.ReadAt(b, 0); err != syscall.EBADF { + t.Errorf("%v: ReadAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Readdir(0, 100); err != syscall.EBADF { + t.Errorf("%v: Readdir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.FSync(); err != syscall.EBADF { + t.Errorf("%v: FSync() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func SetGetAttr(l *localFile, valid p9.SetAttrMask, attr p9.SetAttr) (p9.Attr, error) { + if err := l.SetAttr(valid, attr); err != nil { + return p9.Attr{}, err + } + _, _, a, err := l.GetAttr(p9.AttrMask{}) + if err != nil { + return p9.Attr{}, err + } + return a, nil +} + +func TestSetAttrPerm(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{Permissions: true} + attr := p9.SetAttr{Permissions: 
0777} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + } else { + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Permissions, err) + } + if got.Mode.Permissions() != attr.Permissions { + t.Errorf("%v: wrong permission, got: %v, expected: %v", s, got.Mode.Permissions(), attr.Permissions) + } + } + }) +} + +func TestSetAttrSize(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + for _, size := range []uint64{1024, 0, 1024 * 1024} { + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: size} + got, err := SetGetAttr(s.file, valid, attr) + if s.ft == symlink || s.ft == directory { + if err == nil { + t.Fatalf("%v: SetGetAttr(valid, %v) should have failed", s, attr.Permissions) + } + // Run for one size only, they will all fail the same way. + return + } + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.Size, err) + } + if got.Size != size { + t.Errorf("%v: wrong size, got: %v, expected: %v", s, got.Size, size) + } + } + }) +} + +func TestSetAttrTime(t *testing.T) { + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + valid := p9.SetAttrMask{ATime: true, ATimeNotSystemTime: true} + attr := p9.SetAttr{ATimeSeconds: 123, ATimeNanoSeconds: 456} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.ATimeSeconds, attr.ATimeNanoSeconds, err) + } + if got.ATimeSeconds != 123 { + t.Errorf("%v: wrong ATimeSeconds, got: %v, expected: %v", s, got.ATimeSeconds, 123) + } + if got.ATimeNanoSeconds != 456 { + t.Errorf("%v: wrong ATimeNanoSeconds, got: %v, expected: %v", s, got.ATimeNanoSeconds, 456) + } + + valid = p9.SetAttrMask{MTime: true, MTimeNotSystemTime: true} + attr = p9.SetAttr{MTimeSeconds: 789, MTimeNanoSeconds: 012} + got, err = SetGetAttr(s.file, valid, 
attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v:%v) failed, err: %v", s, attr.MTimeSeconds, attr.MTimeNanoSeconds, err) + } + if got.MTimeSeconds != 789 { + t.Errorf("%v: wrong MTimeSeconds, got: %v, expected: %v", s, got.MTimeSeconds, 789) + } + if got.MTimeNanoSeconds != 012 { + t.Errorf("%v: wrong MTimeNanoSeconds, got: %v, expected: %v", s, got.MTimeNanoSeconds, 012) + } + }) +} + +func TestSetAttrOwner(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("SetAttr(owner) test requires CAP_CHOWN, running as %d", os.Getuid()) + } + + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + newUID := os.Getuid() + 1 + valid := p9.SetAttrMask{UID: true} + attr := p9.SetAttr{UID: p9.UID(newUID)} + got, err := SetGetAttr(s.file, valid, attr) + if err != nil { + t.Fatalf("%v: SetGetAttr(valid, %v) failed, err: %v", s, attr.UID, err) + } + if got.UID != p9.UID(newUID) { + t.Errorf("%v: wrong uid, got: %v, expected: %v", s, got.UID, newUID) + } + }) +} + +func TestLink(t *testing.T) { + if os.Getuid() != 0 { + t.Skipf("Link test requires CAP_DAC_READ_SEARCH, running as %d", os.Getuid()) + } + runCustom(t, allTypes, rwConfs, func(t *testing.T, s state) { + const dirName = "linkdir" + const linkFile = "link" + if _, err := s.root.Mkdir(dirName, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, dirName, err) + } + _, dir, err := s.root.Walk([]string{dirName}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, dirName, err) + } + + err = dir.Link(s.file, linkFile) + if s.ft == directory { + if err != syscall.EPERM { + t.Errorf("%v: Link(target, %s) should have failed, got: %v, expected: syscall.EPERM", s, linkFile, err) + } + return + } + if err != nil { + t.Errorf("%v: Link(target, %s) failed, err: %v", s, linkFile, err) + } + }) +} + +func TestROMountChecks(t *testing.T) { + runCustom(t, allTypes, roConfs, func(t *testing.T, s state) { + if _, _, _, _, err := s.file.Create("..", 
p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.Rename(s.file, ".."); err != syscall.EBADF { + t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EBADF { + t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.UnlinkAt("..", 0); err != syscall.EBADF { + t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + if err := s.file.Link(s.file, ".."); err != syscall.EBADF { + t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + + valid := p9.SetAttrMask{Size: true} + attr := p9.SetAttr{Size: 0} + if err := s.file.SetAttr(valid, attr); err != syscall.EBADF { + t.Errorf("%v: SetAttr() should have failed, got: %v, expected: syscall.EBADF", s, err) + } + }) +} + +func TestInvalidName(t *testing.T) { + runCustom(t, []fileType{regular}, rwConfs, func(t *testing.T, s state) { + if _, _, _, _, err := s.file.Create("..", p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: Create() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, _, err := s.file.Walk([]string{".."}); err != syscall.EINVAL { + t.Errorf("%v: Walk() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, err := s.file.Mkdir("..", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: MkDir() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := 
s.file.Rename(s.file, ".."); err != syscall.EINVAL { + t.Errorf("%v: Rename() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if _, err := s.file.Symlink("some_place", "..", p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != syscall.EINVAL { + t.Errorf("%v: Symlink() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := s.file.UnlinkAt("..", 0); err != syscall.EINVAL { + t.Errorf("%v: UnlinkAt() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + if err := s.file.Link(s.file, ".."); err != syscall.EINVAL { + t.Errorf("%v: Link() should have failed, got: %v, expected: syscall.EINVAL", s, err) + } + }) +} + +func TestIsNameValid(t *testing.T) { + valid := []string{ + "name", + "123", + "!@#$%^&*()", + ".name", + "..name", + "...", + } + for _, s := range valid { + if got := isNameValid(s); !got { + t.Errorf("isNameValid(%s) failed, got: %v, expected: true", s, got) + } + } + invalid := []string{ + ".", + "..", + "name/name", + "/name", + "name/", + } + for _, s := range invalid { + if got := isNameValid(s); got { + t.Errorf("isNameValid(%s) failed, got: %v, expected: false", s, got) + } + } +} + +func TestWalkNotFound(t *testing.T) { + runCustom(t, []fileType{directory}, allConfs, func(t *testing.T, s state) { + if _, _, err := s.file.Walk([]string{"nobody-here"}); err != syscall.ENOENT { + t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: syscall.ENOENT", s, "nobody-here", err) + } + }) +} + +func TestWalkDup(t *testing.T) { + runAll(t, func(t *testing.T, s state) { + _, dup, err := s.file.Walk([]string{}) + if err != nil { + t.Fatalf("%v: Walk(nil) failed, err: %v", s, err) + } + // Check that 'dup' is usable. 
+ if _, _, _, err := dup.GetAttr(p9.AttrMask{}); err != nil { + t.Errorf("%v: GetAttr() failed, err: %v", s, err) + } + }) +} + +func TestReaddir(t *testing.T) { + runCustom(t, []fileType{directory}, rwConfs, func(t *testing.T, s state) { + name := "dir" + if _, err := s.file.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: MkDir(%s) failed, err: %v", s, name, err) + } + name = "symlink" + if _, err := s.file.Symlink("/some/target", name, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil { + t.Fatalf("%v: Symlink(%q) failed, err: %v", s, name, err) + } + name = "file" + _, f, _, _, err := s.file.Create(name, p9.ReadWrite, 0555, p9.UID(os.Getuid()), p9.GID(os.Getgid())) + if err != nil { + t.Fatalf("%v: createFile(root, %q) failed, err: %v", s, name, err) + } + f.Close() + + if _, _, _, err := s.file.Open(p9.ReadOnly); err != nil { + t.Fatalf("%v: Open(ReadOnly) failed, err: %v", s, err) + } + + dirents, err := s.file.Readdir(0, 10) + if err != nil { + t.Fatalf("%v: Readdir(0, 10) failed, err: %v", s, err) + } + if len(dirents) != 3 { + t.Fatalf("%v: Readdir(0, 10) wrong number of items, got: %v, expected: 3", s, len(dirents)) + } + var dir, symlink, file bool + for _, d := range dirents { + switch d.Name { + case "dir": + if d.Type != p9.TypeDir { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeDir) + } + dir = true + case "symlink": + if d.Type != p9.TypeSymlink { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeSymlink) + } + symlink = true + case "file": + if d.Type != p9.TypeRegular { + t.Errorf("%v: dirent.Type got: %v, expected: %v", s, d.Type, p9.TypeRegular) + } + file = true + default: + t.Errorf("%v: dirent.Name got: %v", s, d.Name) + } + + _, f, err := s.file.Walk([]string{d.Name}) + if err != nil { + t.Fatalf("%v: Walk({%s}) failed, err: %v", s, d.Name, err) + } + _, _, a, err := f.GetAttr(p9.AttrMask{}) + if err != nil { + t.Fatalf("%v: GetAttr() failed, err: %v", 
s, err) + } + if d.Type != a.Mode.QIDType() { + t.Errorf("%v: dirent.Type different than GetAttr().Mode.QIDType(), got: %v, expected: %v", s, d.Type, a.Mode.QIDType()) + } + } + if !dir || !symlink || !file { + t.Errorf("%v: Readdir(0, 10) wrong files returned, dir: %v, symlink: %v, file: %v", s, dir, symlink, file) + } + }) +} diff --git a/runsc/fsgofer/fsgofer_unsafe.go b/runsc/fsgofer/fsgofer_unsafe.go new file mode 100644 index 000000000..e676809ac --- /dev/null +++ b/runsc/fsgofer/fsgofer_unsafe.go @@ -0,0 +1,58 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsgofer + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func statAt(dirFd int, name string) (syscall.Stat_t, error) { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return syscall.Stat_t{}, extractErrno(err) + } + namePtr := uintptr(unsafe.Pointer(nameBytes)) + + var stat syscall.Stat_t + statPtr := uintptr(unsafe.Pointer(&stat)) + + if _, _, err := syscall.Syscall6(syscall.SYS_NEWFSTATAT, uintptr(dirFd), namePtr, statPtr, linux.AT_SYMLINK_NOFOLLOW, 0, 0); err != 0 { + return syscall.Stat_t{}, err + } + return stat, nil +} + +func utimensat(dirFd int, name string, times [2]syscall.Timespec, flags int) error { + // utimensat(2) doesn't accept empty name, instead name must be nil to make it + // operate directly on 'dirFd' unlike other *at syscalls. 
+ var namePtr uintptr + if name != "" { + nameBytes, err := syscall.BytePtrFromString(name) + if err != nil { + return extractErrno(err) + } + namePtr = uintptr(unsafe.Pointer(nameBytes)) + } + + timesPtr := uintptr(unsafe.Pointer(&times[0])) + + if _, _, err := syscall.Syscall6(syscall.SYS_UTIMENSAT, uintptr(dirFd), namePtr, timesPtr, uintptr(flags), 0, 0); err != 0 { + return err + } + return nil +} diff --git a/runsc/main.go b/runsc/main.go new file mode 100644 index 000000000..cf4b99d3f --- /dev/null +++ b/runsc/main.go @@ -0,0 +1,199 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary runsc is an implementation of the Open Container Initiative Runtime +// that runs applications inside a sandbox. +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strings" + "syscall" + "time" + + "context" + "flag" + + "github.com/google/subcommands" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" +) + +var ( + // Although these flags are not part of the OCI spec, they are used by + // Docker, and thus should not be changed. 
+ rootDir = flag.String("root", "", "root directory for storage of container state") + logFilename = flag.String("log", "", "file path where internal debug information is written, default is stdout") + logFormat = flag.String("log-format", "text", "log format: text (default) or json") + debug = flag.Bool("debug", false, "enable debug logging") + + // These flags are unique to runsc, and are used to configure parts of the + // system that are not covered by the runtime spec. + + // Debugging flags. + debugLogDir = flag.String("debug-log-dir", "", "additional location for logs. It creates individual log files per command") + logPackets = flag.Bool("log-packets", false, "enable network packet logging") + + // Debugging flags: strace related + strace = flag.Bool("strace", false, "enable strace") + straceSyscalls = flag.String("strace-syscalls", "", "comma-separated list of syscalls to trace. If --strace is true and this list is empty, then all syscalls will be traced.") + straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs") + + // Flags that control sandbox runtime behavior. + platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm") + network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.") + fileAccess = flag.String("file-access", "proxy", "specifies which filesystem to use: proxy (default), direct. Using a proxy is more secure because it disallows the sandbox from opennig files directly in the host.") + overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") +) + +func main() { + // Help and flags commands are generated automatically. 
+ subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + + // Register user-facing runsc commands. + subcommands.Register(new(cmd.Create), "") + subcommands.Register(new(cmd.Delete), "") + subcommands.Register(new(cmd.Events), "") + subcommands.Register(new(cmd.Exec), "") + subcommands.Register(new(cmd.Gofer), "") + subcommands.Register(new(cmd.Kill), "") + subcommands.Register(new(cmd.List), "") + subcommands.Register(new(cmd.PS), "") + subcommands.Register(new(cmd.Run), "") + subcommands.Register(new(cmd.Start), "") + subcommands.Register(new(cmd.State), "") + + // Register internal commands with the internal group name. This causes + // them to be sorted below the user-facing commands with empty group. + // The string below will be printed above the commands. + const internalGroup = "internal use only" + subcommands.Register(new(cmd.Boot), internalGroup) + subcommands.Register(new(cmd.Gofer), internalGroup) + + // All subcommands must be registered before flag parsing. + flag.Parse() + + platformType, err := boot.MakePlatformType(*platform) + if err != nil { + cmd.Fatalf("%v", err) + } + + fsAccess, err := boot.MakeFileAccessType(*fileAccess) + if err != nil { + cmd.Fatalf("%v", err) + } + + netType, err := boot.MakeNetworkType(*network) + if err != nil { + cmd.Fatalf("%v", err) + } + + // Create a new Config from the flags. + conf := &boot.Config{ + RootDir: *rootDir, + FileAccess: fsAccess, + Overlay: *overlay, + Network: netType, + LogPackets: *logPackets, + Platform: platformType, + Strace: *strace, + StraceLogSize: *straceLogSize, + } + if len(*straceSyscalls) != 0 { + conf.StraceSyscalls = strings.Split(*straceSyscalls, ",") + } + + // Set up logging. 
+ if *debug { + log.SetLevel(log.Debug) + } + + var logFile io.Writer = os.Stderr + if *logFilename != "" { + f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", *logFilename, err) + } + logFile = f + } + + var e log.Emitter + switch *logFormat { + case "text": + e = log.GoogleEmitter{&log.Writer{Next: logFile}} + case "json": + e = log.JSONEmitter{log.Writer{Next: logFile}} + default: + cmd.Fatalf("invalid log format %q, must be 'json' or 'text'", *logFormat) + } + + if *debugLogDir != "" { + if err := os.MkdirAll(*debugLogDir, 0775); err != nil { + cmd.Fatalf("error creating dir %q: %v", *debugLogDir, err) + } + + // Format: <debug-log-dir>/runsc.log.<yyymmdd-hhmmss.uuuuuu>.<command> + scmd := flag.CommandLine.Arg(0) + filename := fmt.Sprintf("runsc.log.%s.%s", time.Now().Format("20060102-150405.000000"), scmd) + path := filepath.Join(*debugLogDir, filename) + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) + if err != nil { + cmd.Fatalf("error opening log file %q: %v", filename, err) + } + e = log.MultiEmitter{e, log.GoogleEmitter{&log.Writer{Next: f}}} + } + + log.SetTarget(e) + + log.Infof("***************************") + log.Infof("Args: %s", os.Args) + log.Infof("PID: %d", os.Getpid()) + log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid()) + log.Infof("Configuration:") + log.Infof("\t\tRootDir: %s", conf.RootDir) + log.Infof("\t\tPlatform: %v", conf.Platform) + log.Infof("\t\tFileAccess: %v, overlay: %t", conf.FileAccess, conf.Overlay) + log.Infof("\t\tNetwork: %v, logging: %t", conf.Network, conf.LogPackets) + log.Infof("\t\tStrace: %t, max size: %d, syscalls: %s", conf.Strace, conf.StraceLogSize, conf.StraceSyscalls) + log.Infof("***************************") + + // Call the subcommand and pass in the configuration. 
+ var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode == subcommands.ExitSuccess { + log.Infof("Exiting with status: %v", ws) + if ws.Signaled() { + // No good way to return it, emulate what the shell does. Maybe raise + // signall to self? + os.Exit(128 + int(ws.Signal())) + } + os.Exit(ws.ExitStatus()) + } + // Return an error that is unlikely to be used by the application. + log.Warningf("Failure to execute command, err: %v", subcmdCode) + os.Exit(128) +} + +func init() { + // Set default root dir to something (hopefully) user-writeable. + *rootDir = "/var/run/runsc" + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + *rootDir = filepath.Join(runtimeDir, "runsc") + } +} diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD new file mode 100644 index 000000000..bdd95903e --- /dev/null +++ b/runsc/sandbox/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "sandbox", + srcs = [ + "console.go", + "hook.go", + "namespace.go", + "network.go", + "sandbox.go", + "status.go", + ], + importpath = "gvisor.googlesource.com/gvisor/runsc/sandbox", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/control/client", + "//pkg/control/server", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/urpc", + "//runsc/boot", + "//runsc/specutils", + "@com_github_kr_pty//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@com_github_vishvananda_netlink//:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "sandbox_test", + size = "small", + srcs = ["sandbox_test.go"], + pure = "on", + rundir = ".", + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/control", + "//pkg/sentry/kernel/auth", + "//pkg/unet", + "//runsc/boot", + "//runsc/cmd", + "//runsc/sandbox", + 
"@com_github_google_subcommands//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/runsc/sandbox/console.go b/runsc/sandbox/console.go new file mode 100644 index 000000000..3f133e12a --- /dev/null +++ b/runsc/sandbox/console.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "net" + "os" + + "github.com/kr/pty" + "golang.org/x/sys/unix" +) + +// setupConsole creates pty master/slave pair, sends the master FD over the +// given socket, and returns the slave. +func setupConsole(socketPath string) (*os.File, error) { + // Create a new pty master and slave. + ptyMaster, ptySlave, err := pty.Open() + if err != nil { + return nil, fmt.Errorf("error opening pty: %v", err) + } + defer ptyMaster.Close() + + // Get a connection to the socket path. + conn, err := net.Dial("unix", socketPath) + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error dial socket %q: %v", socketPath, err) + } + uc, ok := conn.(*net.UnixConn) + if !ok { + ptySlave.Close() + return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) + } + socket, err := uc.File() + if err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error getting file for unix socket %v: %v", uc, err) + } + + // Send the master FD over the connection. 
+ msg := unix.UnixRights(int(ptyMaster.Fd())) + if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { + ptySlave.Close() + return nil, fmt.Errorf("error sending console over unix socket %q: %v", socketPath, err) + } + return ptySlave, nil +} diff --git a/runsc/sandbox/hook.go b/runsc/sandbox/hook.go new file mode 100644 index 000000000..40b064cdc --- /dev/null +++ b/runsc/sandbox/hook.go @@ -0,0 +1,111 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "path/filepath" + "strings" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// This file implements hooks as defined in OCI spec: +// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22 +// +// "hooks":{ +// "prestart":[{ +// "path":"/usr/bin/dockerd", +// "args":[ +// "libnetwork-setkey", "arg2", +// ] +// }] +// }, + +// executeHooksBestEffort executes hooks and logs warning in case they fail. +// Runs all hooks, always. +func executeHooksBestEffort(hooks []specs.Hook, s specs.State) { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + log.Warningf("Failure to execute hook %+v, err: %v", h, err) + } + } +} + +// executeHooks executes hooks until the first one fails or they all execute. 
+func executeHooks(hooks []specs.Hook, s specs.State) error { + for _, h := range hooks { + if err := executeHook(h, s); err != nil { + return err + } + } + return nil +} + +func executeHook(h specs.Hook, s specs.State) error { + log.Debugf("Executing hook %+v, state: %+v", h, s) + + if strings.TrimSpace(h.Path) == "" { + return fmt.Errorf("empty path for hook") + } + if !filepath.IsAbs(h.Path) { + return fmt.Errorf("path for hook is not absolute: %q", h.Path) + } + + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + + c := make(chan error, 1) + go func() { + c <- cmd.Wait() + }() + + var timer <-chan time.Time + if h.Timeout != nil { + timer = time.After(time.Duration(*h.Timeout) * time.Second) + } + select { + case err := <-c: + if err != nil { + return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String()) + } + case <-timer: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String()) + } + + log.Debugf("Execute hook %q success!", h.Path) + return nil +} diff --git a/runsc/sandbox/namespace.go b/runsc/sandbox/namespace.go new file mode 100644 index 000000000..1d3bcfbb5 --- /dev/null +++ b/runsc/sandbox/namespace.go @@ -0,0 +1,204 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// nsCloneFlag returns the clone flag that can be used to set a namespace of +// the given type. +func nsCloneFlag(nst specs.LinuxNamespaceType) uintptr { + switch nst { + case specs.IPCNamespace: + return syscall.CLONE_NEWIPC + case specs.MountNamespace: + return syscall.CLONE_NEWNS + case specs.NetworkNamespace: + return syscall.CLONE_NEWNET + case specs.PIDNamespace: + return syscall.CLONE_NEWPID + case specs.UTSNamespace: + return syscall.CLONE_NEWUTS + case specs.UserNamespace: + return syscall.CLONE_NEWUSER + case specs.CgroupNamespace: + panic("cgroup namespace has no associated clone flag") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// nsPath returns the path of the namespace for the current process and the +// given namespace. 
+func nsPath(nst specs.LinuxNamespaceType) string { + base := "/proc/self/ns" + switch nst { + case specs.CgroupNamespace: + return filepath.Join(base, "cgroup") + case specs.IPCNamespace: + return filepath.Join(base, "ipc") + case specs.MountNamespace: + return filepath.Join(base, "mnt") + case specs.NetworkNamespace: + return filepath.Join(base, "net") + case specs.PIDNamespace: + return filepath.Join(base, "pid") + case specs.UserNamespace: + return filepath.Join(base, "user") + case specs.UTSNamespace: + return filepath.Join(base, "uts") + default: + panic(fmt.Sprintf("unknown namespace %v", nst)) + } +} + +// getNS returns true and the namespace with the given type from the slice of +// namespaces in the spec. It returns false if the slice does not contain a +// namespace with the type. +func getNS(nst specs.LinuxNamespaceType, s *specs.Spec) (specs.LinuxNamespace, bool) { + if s.Linux == nil { + return specs.LinuxNamespace{}, false + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == nst { + return ns, true + } + } + return specs.LinuxNamespace{}, false +} + +// filterNS returns a slice of namespaces from the spec with types that match +// those in the `filter` slice. +func filterNS(filter []specs.LinuxNamespaceType, s *specs.Spec) []specs.LinuxNamespace { + if s.Linux == nil { + return nil + } + var out []specs.LinuxNamespace + for _, nst := range filter { + if ns, ok := getNS(nst, s); ok { + out = append(out, ns) + } + } + return out +} + +// setNS sets the namespace of the given type. It must be called with +// OSThreadLocked. +func setNS(fd, nsType uintptr) error { + if _, _, err := syscall.RawSyscall(unix.SYS_SETNS, fd, nsType, 0); err != 0 { + return err + } + return nil +} + +// applyNS applies the namespace on the current thread and returns a function +// that will restore the namespace to the original value. +// +// Preconditions: Must be called with os thread locked. 
+func applyNS(ns specs.LinuxNamespace) (func(), error) { + log.Infof("applying namespace %v at path %q", ns.Type, ns.Path) + newNS, err := os.Open(ns.Path) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", ns.Path, err) + } + defer newNS.Close() + + // Store current netns to restore back after child is started. + curPath := nsPath(ns.Type) + oldNS, err := os.Open(curPath) + if err != nil { + return nil, fmt.Errorf("error opening %q: %v", curPath, err) + } + + // Set netns to the one requested and setup function to restore it back. + flag := nsCloneFlag(ns.Type) + if err := setNS(newNS.Fd(), flag); err != nil { + oldNS.Close() + return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err) + } + return func() { + log.Infof("restoring namespace %v", ns.Type) + defer oldNS.Close() + if err := setNS(oldNS.Fd(), flag); err != nil { + panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err)) + } + }, nil +} + +// startInNS joins or creates the given namespaces and calls cmd.Start before +// restoring the namespaces to the original values. +func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error { + // We are about to setup namespaces, which requires the os thread being + // locked so that Go doesn't change the thread out from under us. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + + for _, ns := range nss { + if ns.Path == "" { + // No path. Just set a flag to create a new namespace. + cmd.SysProcAttr.Cloneflags |= nsCloneFlag(ns.Type) + continue + } + // Join the given namespace, and restore the current namespace + // before exiting. + restoreNS, err := applyNS(ns) + if err != nil { + return err + } + defer restoreNS() + } + + return cmd.Start() +} + +// setUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd. 
+func setUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) { + if s.Linux == nil { + return + } + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + for _, idMap := range s.Linux.UIDMappings { + log.Infof("Mapping host uid %d to container uid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.UidMappings = append(cmd.SysProcAttr.UidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } + for _, idMap := range s.Linux.GIDMappings { + log.Infof("Mapping host gid %d to container gid %d (size=%d)", idMap.HostID, idMap.ContainerID, idMap.Size) + cmd.SysProcAttr.GidMappings = append(cmd.SysProcAttr.GidMappings, syscall.SysProcIDMap{ + ContainerID: int(idMap.ContainerID), + HostID: int(idMap.HostID), + Size: int(idMap.Size), + }) + } +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go new file mode 100644 index 000000000..1b6a1d9a6 --- /dev/null +++ b/runsc/sandbox/network.go @@ -0,0 +1,348 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package sandbox + +import ( + "fmt" + "net" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/vishvananda/netlink" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" +) + +// setupNetwork configures the network stack to mimic the local network +// configuration. Docker uses network namespaces with vnets to configure the +// network for the container. The untrusted app expects to see the same network +// inside the sandbox. Routing and port mapping is handled directly by docker +// with most of network information not even available to the runtime. +// +// Netstack inside the sandbox speaks directly to the device using a raw socket. +// All IP addresses assigned to the NIC, are removed and passed on to netstack's +// device. +// +// If 'conf.Network' is NoNetwork, skips local configuration and creates a +// loopback interface only. +// +// Run the following container to test it: +// docker run -di --runtime=runsc -p 8080:80 -v $PWD:/usr/local/apache2/htdocs/ httpd:2.4 +func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Config) error { + log.Infof("Setting up network") + + // HACK! + // + // When kubernetes starts a pod, it first creates a sandbox with an + // application that just pauses forever. Later, when a container is + // added to the pod, kubernetes will create another sandbox with a + // config that corresponds to the containerized application, and add it + // to the same namespaces as the pause sandbox. + // + // Running a second sandbox currently breaks because the two sandboxes + // have the same network namespace and configuration, and try to create + // a tap device on the same host device which fails. 
+ // + // Runsc will eventually need to detect that this container is meant to + // be run in the same sandbox as the pausing application, and somehow + // make that happen. + // + // For now the following HACK disables networking for the "pause" + // sandbox, allowing the second sandbox to start up successfully. + // + // Cri-o helpfully adds the "ContainerType" annotation that we can use + // to detect whether we are a pod or container. Cri-containerd will + // support this eventually, but does not currently + // (https://github.com/kubernetes-incubator/cri-containerd/issues/512). + // + // Thus, to support cri-containerd, we check if the exec args is + // "/pause", which is pretty gross. + // + // TODO: Remove this once multiple containers per sandbox + // is properly supported. + if spec.Annotations["io.kubernetes.cri-o.ContainerType"] == "sandbox" || spec.Process.Args[0] == "/pause" { + log.Warningf("HACK: Disabling network") + conf.Network = boot.NetworkNone + } + + switch conf.Network { + case boot.NetworkNone: + log.Infof("Network is disabled, create loopback interface only") + if err := createDefaultLoopbackInterface(conn); err != nil { + return fmt.Errorf("error creating default loopback interface: %v", err) + } + case boot.NetworkSandbox: + // Build the path to the net namespace of the sandbox process. + // This is what we will copy. + nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") + if err := createInterfacesAndRoutesFromNS(conn, nsPath); err != nil { + return fmt.Errorf("error creating interfaces from net namespace %q: %v", nsPath, err) + } + case boot.NetworkHost: + // Nothing to do here. 
+ default: + return fmt.Errorf("Invalid network type: %d", conf.Network) + } + return nil +} + +func createDefaultLoopbackInterface(conn *urpc.Client) error { + link := boot.LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []boot.Route{ + { + Destination: net.IP("\x7f\x00\x00\x00"), + Mask: net.IPMask("\xff\x00\x00\x00"), + }, + { + Destination: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", 16)), + }, + }, + } + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{link}, + }, nil); err != nil { + return fmt.Errorf("error creating loopback link and routes: %v", err) + } + return nil +} + +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := applyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) + if err != nil { + runtime.UnlockOSThread() + return nil, fmt.Errorf("error joining net namespace %q: %v", nsPath, err) + } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} + +// isRootNS determines whether we are running in the root net namespace. +// +// TODO: Find a better way to detect root network. +func isRootNS(ifaces []net.Interface) bool { + for _, iface := range ifaces { + if iface.Name == "docker0" { + return true + } + } + return false + +} + +// createInterfacesAndRoutesFromNS scrapes the interface and routes from the +// net namespace with the given path, creates them in the sandbox, and removes +// them from the host. +func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string) error { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return err + } + defer restore() + + // Get all interfaces in the namespace. 
+ ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("error querying interfaces: %v", err) + } + + if isRootNS(ifaces) { + return fmt.Errorf("cannot run in with network enabled in root network namespace") + } + + // Collect addresses and routes from the interfaces. + var args boot.CreateLinksAndRoutesArgs + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + log.Infof("Skipping down interface: %+v", iface) + continue + } + + ifaddrs, err := iface.Addrs() + if err != nil { + return fmt.Errorf("error fetching interface addresses for %q: %v", iface.Name, err) + } + + // We build our own loopback devices. + if iface.Flags&net.FlagLoopback != 0 { + links, err := loopbackLinks(iface, ifaddrs) + if err != nil { + return fmt.Errorf("error getting loopback routes and links for iface %q: %v", iface.Name, err) + } + args.LoopbackLinks = append(args.LoopbackLinks, links...) + continue + } + + // Get the link for the interface. + ifaceLink, err := netlink.LinkByName(iface.Name) + if err != nil { + return fmt.Errorf("error getting link for interface %q: %v", iface.Name, err) + } + + // Create the socket. + const protocol = 0x0300 // htons(ETH_P_ALL) + fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol) + if err != nil { + return fmt.Errorf("unable to create raw socket: %v", err) + } + deviceFile := os.NewFile(uintptr(fd), "raw-device-fd") + + // Bind to the appropriate device. + ll := syscall.SockaddrLinklayer{ + Protocol: protocol, + Ifindex: ifaceLink.Attrs().Index, + Hatype: 0, // No ARP type. + Pkttype: syscall.PACKET_OTHERHOST, + } + if err := syscall.Bind(fd, &ll); err != nil { + return fmt.Errorf("unable to bind to %q: %v", iface.Name, err) + } + + // Scrape the routes before removing the address, since that + // will remove the routes as well. 
+ routes, def, err := routesForIface(iface) + if err != nil { + return fmt.Errorf("error getting routes for interface %q: %v", iface.Name, err) + } + if def != nil { + if !args.DefaultGateway.Route.Empty() { + return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway) + } + args.DefaultGateway.Route = *def + args.DefaultGateway.Name = iface.Name + } + + link := boot.FDBasedLink{ + Name: iface.Name, + MTU: iface.MTU, + Routes: routes, + } + + // Collect the addresses for the interface, enable forwarding, + // and remove them from the host. + for _, ifaddr := range ifaddrs { + ipNet, ok := ifaddr.(*net.IPNet) + if !ok { + return fmt.Errorf("address is not IPNet: %t %+v", ifaddr, ifaddr) + } + link.Addresses = append(link.Addresses, ipNet.IP) + + // Steal IP address from NIC. + if err := removeAddress(ifaceLink, ipNet.String()); err != nil { + return fmt.Errorf("error removing address %v from device %q: %v", iface.Name, ipNet, err) + } + } + + args.FilePayload.Files = append(args.FilePayload.Files, deviceFile) + args.FDBasedLinks = append(args.FDBasedLinks, link) + } + + log.Debugf("Setting up network, config: %+v", args) + if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { + return fmt.Errorf("error creating links and routes: %v", err) + } + return nil +} + +// loopbackLinks collects the links for a loopback interface. 
+func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) { + var links []boot.LoopbackLink + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + return nil, fmt.Errorf("address is not IPNet: %t %+v", addr, addr) + } + links = append(links, boot.LoopbackLink{ + Name: iface.Name, + Addresses: []net.IP{ipNet.IP}, + Routes: []boot.Route{{ + Destination: ipNet.IP.Mask(ipNet.Mask), + Mask: ipNet.Mask, + }}, + }) + } + return links, nil +} + +// routesForIface iterates over all routes for the given interface and converts +// them to boot.Routes. +func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) { + link, err := netlink.LinkByIndex(iface.Index) + if err != nil { + return nil, nil, err + } + rs, err := netlink.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return nil, nil, fmt.Errorf("error getting routes from %q: %v", iface.Name, err) + } + + var def *boot.Route + var routes []boot.Route + for _, r := range rs { + // Is it a default route? + if r.Dst == nil { + if r.Gw == nil { + return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r) + } + if def != nil { + return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r) + } + emptyAddr := net.IPv6zero + if r.Gw.To4() != nil { + emptyAddr = net.IPv4zero + } + // Create a catch all route to the gateway. + def = &boot.Route{ + Destination: emptyAddr, + Mask: net.IPMask(emptyAddr), + Gateway: r.Gw, + } + continue + } + routes = append(routes, boot.Route{ + Destination: r.Dst.IP.Mask(r.Dst.Mask), + Mask: r.Dst.Mask, + }) + } + return routes, def, nil +} + +// removeAddress removes IP address from network device. 
// It's equivalent to:
//   ip addr del <ipAndMask> dev <name>
//
// ipAndMask is parsed with netlink.ParseAddr; a parse failure is returned
// unchanged.
func removeAddress(source netlink.Link, ipAndMask string) error {
	addr, err := netlink.ParseAddr(ipAndMask)
	if err != nil {
		return err
	}
	return netlink.AddrDel(source, addr)
}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
new file mode 100644
index 000000000..b2fa1d58e
--- /dev/null
+++ b/runsc/sandbox/sandbox.go
@@ -0,0 +1,666 @@
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sandbox creates and manipulates sandboxes.
package sandbox

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strconv"
	"syscall"
	"time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"gvisor.googlesource.com/gvisor/pkg/control/client"
	"gvisor.googlesource.com/gvisor/pkg/control/server"
	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
	"gvisor.googlesource.com/gvisor/pkg/urpc"
	"gvisor.googlesource.com/gvisor/runsc/boot"
	"gvisor.googlesource.com/gvisor/runsc/specutils"
)

// metadataFilename is the name of the metadata file relative to sandboxRoot
// that holds sandbox metadata.
const metadataFilename = "meta.json"

// idRegex matches valid sandbox ids: one or more word characters or
// punctuation from the class below.
//
// See libcontainer/factory_linux.go
//
// NOTE(review): inside the character class, `+-\.` is a range from '+' to
// '.', so ',' is accepted in addition to '+', '-' and '.'. Confirm whether
// that is intended before tightening it, since existing ids may rely on it.
var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)

// validateID validates the sandbox id.
func validateID(id string) error {
	// The id is later used as a directory name under the root dir, so it is
	// restricted to the characters allowed by idRegex.
	if !idRegex.MatchString(id) {
		return fmt.Errorf("invalid sandbox id: %v", id)
	}
	return nil
}

// Sandbox wraps a child sandbox process, and is responsible for saving and
// loading sandbox metadata to disk.
//
// Within a root directory, we maintain subdirectories for each sandbox named
// with the sandbox id. The sandbox metadata is stored as json within the
// sandbox directory in a file named "meta.json". This metadata format is
// defined by us, and is not part of the OCI spec.
//
// Sandboxes must write this metadata file after any change to their internal
// state. The entire sandbox directory is deleted when the sandbox is
// destroyed.
//
// TODO: Protect against concurrent changes to the sandbox metadata
// file.
type Sandbox struct {
	// ID is the sandbox ID.
	ID string `json:"id"`

	// Spec is the OCI runtime spec that configures this sandbox.
	Spec *specs.Spec `json:"spec"`

	// BundleDir is the directory containing the sandbox bundle.
	BundleDir string `json:"bundleDir"`

	// SandboxRoot is the directory containing the sandbox metadata file.
	SandboxRoot string `json:"sandboxRoot"`

	// CreatedAt is the time the sandbox was created.
	CreatedAt time.Time `json:"createdAt"`

	// Owner is the sandbox owner.
	Owner string `json:"owner"`

	// ConsoleSocket is the path to a unix domain socket that will receive
	// the console FD. It is only used during create, so we don't need to
	// store it in the metadata.
	ConsoleSocket string `json:"-"`

	// Pid is the pid of the running sandbox. Only valid if Status is
	// Created or Running.
	Pid int `json:"pid"`

	// GoferPid is the pid of the gofer running along side the sandbox. May be 0
	// if the gofer has been killed or it's not being used.
	GoferPid int `json:"goferPid"`

	// Status is the current sandbox Status.
	Status Status `json:"status"`
}

// Create creates the sandbox subprocess and writes the metadata file. Args
// are additional arguments that will be passed to the sandbox process.
//
// It fails if the id is invalid or a sandbox directory for the id already
// exists under conf.RootDir. On any failure after the processes have been
// started, the sandbox is destroyed before the error is returned. When pidFile
// is non-empty, the sandbox pid is written to it as the very last step.
func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) {
	log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir)
	if err := validateID(id); err != nil {
		return nil, err
	}

	sandboxRoot := filepath.Join(conf.RootDir, id)
	if exists(sandboxRoot) {
		return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot)
	}

	s := &Sandbox{
		ID:            id,
		Spec:          spec,
		ConsoleSocket: consoleSocket,
		BundleDir:     bundleDir,
		SandboxRoot:   sandboxRoot,
		Status:        Creating,
		Owner:         os.Getenv("USER"),
	}

	// Create sandbox process. If anything errors between now and the end of this
	// function, we MUST clean up all sandbox resources.
	if err := s.createProcesses(conf, args); err != nil {
		s.Destroy()
		return nil, err
	}

	// Wait for the control server to come up (or timeout). The sandbox is
	// not "created" until that happens.
	if err := s.waitForCreated(10 * time.Second); err != nil {
		s.Destroy()
		return nil, err
	}

	s.Status = Created
	s.CreatedAt = time.Now()

	// Save the metadata file.
	if err := s.save(); err != nil {
		s.Destroy()
		return nil, err
	}

	// Write the pid file. Containerd considers the create complete after
	// this file is created, so it must be the last thing we do.
	if pidFile != "" {
		if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil {
			s.Destroy()
			return nil, fmt.Errorf("error writing pid file: %v", err)
		}
	}

	return s, nil
}

// Run is a helper that calls Create + Start + Wait.
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) { + s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args) + if err != nil { + return 0, fmt.Errorf("error creating sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + return 0, fmt.Errorf("error starting sandbox: %v", err) + } + return s.Wait() +} + +// Load loads a sandbox from with the given id from a metadata file. +func Load(rootDir, id string) (*Sandbox, error) { + log.Debugf("Load sandbox %q %q", rootDir, id) + if err := validateID(id); err != nil { + return nil, err + } + sandboxRoot := filepath.Join(rootDir, id) + if !exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q does not exist", id) + } + metaFile := filepath.Join(sandboxRoot, metadataFilename) + if !exists(metaFile) { + return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile) + } + metaBytes, err := ioutil.ReadFile(metaFile) + if err != nil { + return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err) + } + var s Sandbox + if err := json.Unmarshal(metaBytes, &s); err != nil { + return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err) + } + + // If the status is "Running" or "Created", check that the process + // still exists, and set it to Stopped if it does not. + // + // This is inherintly racey. + if s.Status == Running || s.Status == Created { + // Send signal 0 to check if process exists. + if err := s.Signal(0); err != nil { + // Process no longer exists. + s.Status = Stopped + s.Pid = 0 + } + } + + return &s, nil +} + +// List returns all sandbox ids in the given root directory. 
+func List(rootDir string) ([]string, error) { + log.Debugf("List sandboxes %q", rootDir) + fs, err := ioutil.ReadDir(rootDir) + if err != nil { + return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + } + var out []string + for _, f := range fs { + out = append(out, f.Name()) + } + return out, nil +} + +// State returns the metadata of the sandbox. +func (s *Sandbox) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: s.ID, + Status: s.Status.String(), + Pid: s.Pid, + Bundle: s.BundleDir, + } +} + +// Start starts running the containerized process inside the sandbox. +func (s *Sandbox) Start(conf *boot.Config) error { + log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) + if s.Status != Created { + return fmt.Errorf("cannot start container in state %s", s.Status) + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container". + if s.Spec.Hooks != nil { + if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil { + s.Destroy() + return err + } + } + + c, err := s.connect() + if err != nil { + s.Destroy() + return err + } + defer c.Close() + + // Configure the network. + if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil { + s.Destroy() + return fmt.Errorf("error setting up network: %v", err) + } + + // Send a message to the sandbox control server to start the + // application. + if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { + s.Destroy() + return fmt.Errorf("error starting sandbox: %v", err) + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded". + if s.Spec.Hooks != nil { + executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State()) + } + + s.Status = Running + return s.save() +} + +// Processes retrieves the list of processes and associated metadata inside a +// sandbox. 
+func (s *Sandbox) Processes() ([]*control.Process, error) { + if s.Status != Running { + return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var pl []*control.Process + if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) + } + return pl, nil +} + +// Execute runs the specified command in the sandbox. +func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e) + if s.Status != Created && s.Status != Running { + return 0, fmt.Errorf("cannot exec in container in state %s", s.Status) + } + + log.Debugf("Connecting to sandbox...") + c, err := s.connect() + if err != nil { + return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + defer c.Close() + + // Send a message to the sandbox control server to start the application. + var waitStatus uint32 + if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + return 0, fmt.Errorf("error executing in sandbox: %v", err) + } + + return syscall.WaitStatus(waitStatus), nil +} + +// Event retrieves stats about the sandbox such as memory and CPU utilization. 
+func (s *Sandbox) Event() (*boot.Event, error) { + if s.Status != Running && s.Status != Created { + return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var e boot.Event + if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil { + return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) + } + e.ID = s.ID + return &e, nil +} + +func (s *Sandbox) connect() (*urpc.Client, error) { + log.Debugf("Connecting to sandbox...") + c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + return c, nil +} + +func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error { + binPath, err := specutils.BinPath() + if err != nil { + return err + } + + ioFiles, err := s.createGoferProcess(conf, binPath, args) + if err != nil { + return err + } + return s.createSandboxProcess(conf, binPath, args, ioFiles) +} + +func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) { + if conf.FileAccess != boot.FileAccessProxy { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + var args []string + args = append(args, commonArgs...) + args = append(args, "gofer", "--bundle", s.BundleDir) + + // Start with root mount and then add any other additional mount. + mountCount := 1 + for _, m := range s.Spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + for i := 0; i < mountCount; i++ { + // Create socket that connects the sandbox and gofer. 
+ fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + } + + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + setUIDGIDMappings(cmd, s.Spec) + nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := startInNS(cmd, nss); err != nil { + return nil, err + } + s.GoferPid = cmd.Process.Pid + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + return sandEnds, nil +} + +// createSandboxProcess starts the sandbox as a subprocess by running the "boot" +// command, passing in the bundle dir. +func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error { + // nextFD is used to get unused FDs that we can pass to the sandbox. It + // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. + nextFD := 3 + + // Create control server socket here and donate FD to child process because + // it may be in a different network namespace and won't be reachable from + // outside. + fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + + consoleEnabled := s.ConsoleSocket != "" + + cmd := exec.Command(binPath, commonArgs...) 
+ cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.Args = append(cmd.Args, + "boot", + "--bundle", s.BundleDir, + "--controller-fd="+strconv.Itoa(nextFD), + fmt.Sprintf("--console=%t", consoleEnabled)) + nextFD++ + + controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + defer controllerFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + + // If there is a gofer, sends all socket ends to the sandbox. + for _, f := range ioFiles { + defer f.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandox process. + if consoleEnabled { + // setupConsole will send the master on the socket, and return + // the slave. + tty, err := setupConsole(s.ConsoleSocket) + if err != nil { + return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err) + } + defer tty.Close() + + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = int(tty.Fd()) + } else { + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + + // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT + // when re-parented. + cmd.SysProcAttr.Setsid = true + + // nss is the set of namespaces to join or create before starting the sandbox + // process. IPC and UTS namespaces from the host are not used as they + // are virtualized inside the sandbox. Be paranoid and run inside an empty + // namespace for these. + log.Infof("Sandbox will be started in empty IPC and UTS namespaces") + nss := []specs.LinuxNamespace{ + specs.LinuxNamespace{Type: specs.IPCNamespace}, + specs.LinuxNamespace{Type: specs.UTSNamespace}, + } + + if conf.Platform == boot.PlatformPtrace { + // TODO: Also set an empty PID namespace so that we limit + // access to other host processes. 
+ log.Infof("Sandbox will be started in the current PID namespace") + } else { + log.Infof("Sandbox will be started in empty PID namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + } + + if conf.FileAccess == boot.FileAccessProxy { + log.Infof("Sandbox will be started in empty mount namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) + } else { + log.Infof("Sandbox will be started in the current mount namespace") + } + + // Joins the network namespace if network is enabled. the sandbox talks + // directly to the host network, which may have been configured in the + // namespace. + if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone { + log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) + nss = append(nss, ns) + } else { + log.Infof("Sandbox will be started in empty network namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + } + + // User namespace depends on the following options: + // - Host network/filesystem: requires to run inside the user namespace + // specified in the spec or the current namespace if none is configured. + // - Gofer: when using a Gofer, the sandbox process can run isolated in an + // empty namespace. + if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { + if userns, ok := getNS(specs.UserNamespace, s.Spec); ok { + log.Infof("Sandbox will be started in container's user namespace: %+v", userns) + nss = append(nss, userns) + setUIDGIDMappings(cmd, s.Spec) + } else { + // TODO: Retrict capabilities since it's using current user + // namespace, i.e. root. + log.Infof("Sandbox will be started in the current user namespace") + } + // When running in the caller's defined user namespace, apply the same + // capabilities to the sandbox process to ensure it abides to the same + // rules. 
+ cmd.Args = append(cmd.Args, "--apply-caps=true") + + } else { + log.Infof("Sandbox will be started in empty user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + } + + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + if err := startInNS(cmd, nss); err != nil { + return err + } + s.Pid = cmd.Process.Pid + log.Infof("Sandbox started, pid: %d", s.Pid) + return nil +} + +// waitForCreated waits for the sandbox subprocess control server to be +// running, at which point the sandbox is in Created state. +func (s *Sandbox) waitForCreated(timeout time.Duration) error { + log.Debugf("Waiting for sandbox %q creation", s.ID) + tchan := time.After(timeout) + for { + select { + case <-tchan: + return fmt.Errorf("timed out waiting for sandbox control server") + default: + if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil { + // It's alive! + c.Close() + return nil + } + } + } +} + +// Wait waits for the containerized process to exit, and returns its WaitStatus. +func (s *Sandbox) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid) + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid." + panic(err) + } + ps, err := p.Wait() + if err != nil { + return 0, err + } + return ps.Sys().(syscall.WaitStatus), nil +} + +// Destroy frees all resources associated with the sandbox. +func (s *Sandbox) Destroy() error { + log.Debugf("Destroy sandbox %q", s.ID) + if s.Pid != 0 { + // TODO: Too harsh? 
+ log.Debugf("Killing sandbox %q", s.ID) + sendSignal(s.Pid, unix.SIGKILL) + s.Pid = 0 + } + if s.GoferPid != 0 { + log.Debugf("Killing gofer for sandbox %q", s.ID) + sendSignal(s.GoferPid, unix.SIGKILL) + s.GoferPid = 0 + } + if err := os.RemoveAll(s.SandboxRoot); err != nil { + log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err) + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded". + if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) { + executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State()) + } + + s.Status = Stopped + return nil +} + +// Signal sends the signal to the sandbox. +func (s *Sandbox) Signal(sig syscall.Signal) error { + log.Debugf("Signal sandbox %q", s.ID) + if s.Status == Stopped { + log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid) + return nil + } + return sendSignal(s.Pid, sig) +} + +func sendSignal(pid int, sig syscall.Signal) error { + if err := syscall.Kill(pid, sig); err != nil { + return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) + } + return nil +} + +// save saves the sandbox metadata to a file. +func (s *Sandbox) save() error { + log.Debugf("Save sandbox %q", s.ID) + if err := os.MkdirAll(s.SandboxRoot, 0711); err != nil { + return fmt.Errorf("error creating sandbox root directory %q: %v", s.SandboxRoot, err) + } + meta, err := json.Marshal(s) + if err != nil { + return fmt.Errorf("error marshaling sandbox metadata: %v", err) + } + metaFile := filepath.Join(s.SandboxRoot, metadataFilename) + if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { + return fmt.Errorf("error writing sandbox metadata: %v", err) + } + return nil +} + +// exists returns true if the given file exists. 
+func exists(f string) bool { + if _, err := os.Stat(f); err == nil { + return true + } else if !os.IsNotExist(err) { + // Unexpected Stat failure: log it, then fall through and report + // the file as absent. + log.Warningf("error checking for file %q: %v", f, err) + } + return false +} diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go new file mode 100644 index 000000000..6c71cac30 --- /dev/null +++ b/runsc/sandbox/sandbox_test.go @@ -0,0 +1,649 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox_test + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/signal" + "path/filepath" + "reflect" + "strings" + "syscall" + "testing" + "time" + + "context" + "flag" + "github.com/google/subcommands" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/cmd" + "gvisor.googlesource.com/gvisor/runsc/sandbox" +) + +// init sets the log level to Debug when the test binary starts. +func init() { + log.SetLevel(log.Debug) +} + +// writeSpec writes the spec to disk in the given directory. 
+func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + // config.json is plain data, so it does not need the executable bits. + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0644) +} + +// newSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func newSpecWithArgs(args ...string) *specs.Spec { + spec := &specs.Spec{ + // The host filesystem root is the sandbox root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + }, + } + return spec +} + +// shutdownSignal will be sent to the sandbox in order to shut down cleanly. +const shutdownSignal = syscall.SIGUSR2 + +// setupSandbox creates a bundle and root dir for the sandbox, generates a test +// config, and writes the spec to config.json in the bundle dir. On success the +// caller owns both directories and must remove them; on failure any directory +// that was already created is removed before returning. +func setupSandbox(spec *specs.Spec) (rootDir, bundleDir string, conf *boot.Config, err error) { + rootDir, err = ioutil.TempDir("", "sandboxes") + if err != nil { + return "", "", nil, fmt.Errorf("error creating root dir: %v", err) + } + + bundleDir, err = ioutil.TempDir("", "bundle") + if err != nil { + // Don't leak the root dir created above. + os.RemoveAll(rootDir) + return "", "", nil, fmt.Errorf("error creating bundle dir: %v", err) + } + + if err = writeSpec(bundleDir, spec); err != nil { + // Don't leak either temp dir. + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + return "", "", nil, fmt.Errorf("error writing spec: %v", err) + } + + conf = &boot.Config{ + RootDir: rootDir, + Network: boot.NetworkNone, + } + + return rootDir, bundleDir, conf, nil +} + +// uniqueSandboxID generates a unique sandbox id for each test. +// +// The sandbox id is used to create an abstract unix domain socket, which must +// be unique. While the sandbox forbids creating two sandboxes with the same +// name, sometimes between test runs the socket does not get cleaned up quickly +// enough, causing sandbox creation to fail. 
+func uniqueSandboxID() string { + return fmt.Sprintf("test-sandbox-%d", time.Now().UnixNano()) +} + +// waitForProcessList waits for the given process list to show up in the sandbox. +// It polls every 10ms for up to 10 seconds; on timeout the returned error +// reports the last process list that was observed. +func waitForProcessList(s *sandbox.Sandbox, expected []*control.Process) error { + var got []*control.Process + for start := time.Now(); time.Since(start) < 10*time.Second; { + var err error + // Assign (not :=) so the outer got keeps the last observed list + // for the timeout message below. + got, err = s.Processes() + if err != nil { + return fmt.Errorf("error getting process data from sandbox: %v", err) + } + if procListsEqual(got, expected) { + return nil + } + // Process might not have started, try again... + time.Sleep(10 * time.Millisecond) + } + return fmt.Errorf("sandbox got process list: %s, want: %s", procListToString(got), procListToString(expected)) +} + +// TestLifecycle tests the basic Create/Start/Signal/Destroy sandbox lifecycle. +// It verifies after each step that the sandbox can be loaded from disk, and +// has the correct status. +func TestLifecycle(t *testing.T) { + // The sandbox will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // expectedPL lists the expected process state of the sandbox. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + // Create the sandbox. + id := uniqueSandboxID() + if _, err := sandbox.Create(id, spec, conf, bundleDir, "", "", nil); err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err := sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Created; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // List should return the sandbox id. 
+ ids, err := sandbox.List(rootDir) + if err != nil { + t.Fatalf("error listing sandboxes: %v", err) + } + if got, want := ids, []string{id}; !reflect.DeepEqual(got, want) { + t.Errorf("sandbox list got %v, want %v", got, want) + } + + // Start the sandbox. + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Running; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } + + // Send the sandbox a signal, which we catch and use to cleanly + // shut down. + if err := s.Signal(shutdownSignal); err != nil { + t.Fatalf("error sending signal %v to sandbox: %v", shutdownSignal, err) + } + // Wait for it to die. + if _, err := s.Wait(); err != nil { + t.Fatalf("error waiting on sandbox: %v", err) + } + // Load the sandbox from disk and check the status. + s, err = sandbox.Load(rootDir, id) + if err != nil { + t.Fatalf("error loading sandbox: %v", err) + } + if got, want := s.Status, sandbox.Stopped; got != want { + t.Errorf("sandbox status got %v, want %v", got, want) + } + + // Destroy the sandbox. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } + + // List should not return the sandbox id. + ids, err = sandbox.List(rootDir) + if err != nil { + t.Fatalf("error listing sandboxes: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected sandbox list to be empty, but got %v", ids) + } + + // Loading the sandbox by id should fail. + if _, err = sandbox.Load(rootDir, id); err == nil { + t.Errorf("expected loading destroyed sandbox to fail, but it did not") + } +} + +// Test that we can execute the application with different path formats. 
+func TestExePath(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + } { + spec := newSpecWithArgs(test.path) + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("exec: %s, error setting up sandbox: %v", test.path, err) + } + + ws, err := sandbox.Run(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + + // Remove the temp dirs right away, before the result is checked. + os.RemoveAll(rootDir) + os.RemoveAll(bundleDir) + + if test.success { + if err != nil { + t.Errorf("exec: %s, error running sandbox: %v", test.path, err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: %s, got: no error, want: error", test.path) + } + } + } +} + +// Test that we can retrieve the application exit status from the sandbox. +func TestAppExitStatus(t *testing.T) { + // First sandbox will succeed. + succSpec := newSpecWithArgs("true") + + rootDir, bundleDir, conf, err := setupSandbox(succSpec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + ws, err := sandbox.Run(uniqueSandboxID(), succSpec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error running sandbox: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), 0) + } + + // Second sandbox exits with non-zero status. 
+ wantStatus := 123 + errSpec := newSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) + + rootDir2, bundleDir2, conf, err := setupSandbox(errSpec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir2) + defer os.RemoveAll(bundleDir2) + + // Run the failing spec (errSpec), matching the config.json that + // setupSandbox wrote into bundleDir2. + ws, err = sandbox.Run(uniqueSandboxID(), errSpec, conf, bundleDir2, "", "", nil) + if err != nil { + t.Fatalf("error running sandbox: %v", err) + } + if ws.ExitStatus() != wantStatus { + t.Errorf("got exit status %v want %v", ws.ExitStatus(), wantStatus) + } +} + +// TestExec verifies that a sandbox can exec a new program. +func TestExec(t *testing.T) { + const uid = 343 + spec := newSpecWithArgs("sleep", "100") + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the sandbox. + s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + + // expectedPL lists the expected process state of the sandbox. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + } + + // Verify that "sleep 100" is running. + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Error(err) + } + + execArgs := control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"sleep", "5"}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + Detach: false, + } + + // Verify that "sleep 100" and "sleep 5" are running after exec. + // First, start running exec (which blocks). 
+ status := make(chan error, 1) + go func() { + exitStatus, err := s.Execute(&execArgs) + if err != nil { + status <- err + } else if exitStatus != 0 { + status <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + status <- nil + } + }() + + if err := waitForProcessList(s, expectedPL); err != nil { + t.Fatal(err) + } + + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("sandbox timed out waiting for exec to finish.") + case st := <-status: + if st != nil { + // Report the error received from the exec goroutine; the + // outer err is nil at this point. + t.Errorf("sandbox failed to exec %v: %v", execArgs, st) + } + } +} + +// TestCapabilities verifies that: +// - Running exec as non-root UID and GID will result in an error (because the +// executable file can't be read). +// - Running exec as non-root with CAP_DAC_OVERRIDE succeeds because it skips +// this check. +func TestCapabilities(t *testing.T) { + const uid = 343 + const gid = 2401 + spec := newSpecWithArgs("sleep", "100") + + // We generate files in the host temporary directory. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: os.TempDir(), + Source: os.TempDir(), + Type: "bind", + }) + + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and start the sandbox. + s, err := sandbox.Create(uniqueSandboxID(), spec, conf, bundleDir, "", "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + defer s.Destroy() + if err := s.Start(conf); err != nil { + t.Fatalf("error starting sandbox: %v", err) + } + + // expectedPL lists the expected process state of the sandbox. 
+ expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + }, + } + if err := waitForProcessList(s, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } + + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the sandbox until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. If this fails, the + // first Execute below would fail for the wrong reason, so stop here. + if err := os.Chmod(rootDir, 0755); err != nil { + t.Fatalf("couldn't chmod %q: %v", rootDir, err) + } + + execArgs := control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + Envv: []string{"PATH=" + os.Getenv("PATH")}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + Detach: true, + } + + // "exe" should fail because we don't have the necessary permissions. + if _, err := s.Execute(&execArgs); err == nil { + t.Fatalf("sandbox executed without error, but an error was expected") + } + + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // First, start running exec. + if _, err := s.Execute(&execArgs); err != nil { + t.Fatalf("sandbox failed to exec %v: %v", execArgs, err) + } + + if err := waitForProcessList(s, expectedPL); err != nil { + t.Error(err) + } +} + +// Test that a tty FD is sent over the console socket if one is provided. 
+func TestConsoleSocket(t *testing.T) { + spec := newSpecWithArgs("true") + rootDir, bundleDir, conf, err := setupSandbox(spec) + if err != nil { + t.Fatalf("error setting up sandbox: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create a named socket and start listening. We use a relative path + // to avoid overflowing the unix path length limit (108 chars). + socketPath := filepath.Join(bundleDir, "socket") + cwd, err := os.Getwd() + if err != nil { + t.Fatalf("error getting cwd: %v", err) + } + socketRelPath, err := filepath.Rel(cwd, socketPath) + if err != nil { + t.Fatalf("error getting relative path for %q from cwd %q: %v", socketPath, cwd, err) + } + // Fall back to the absolute path when the relative one is longer. + if len(socketRelPath) > len(socketPath) { + socketRelPath = socketPath + } + srv, err := unet.BindAndListen(socketRelPath, false) + if err != nil { + t.Fatalf("error binding and listening to socket %q: %v", socketPath, err) + } + defer os.Remove(socketPath) + + // Create the sandbox and pass the socket name. + id := uniqueSandboxID() + s, err := sandbox.Create(id, spec, conf, bundleDir, socketRelPath, "", nil) + if err != nil { + t.Fatalf("error creating sandbox: %v", err) + } + + // Open the other end of the socket. + sock, err := srv.Accept() + if err != nil { + t.Fatalf("error accepting socket connection: %v", err) + } + + // Enable FD passing on the reader. We only expect 1 fd. + // NOTE(review): this comment used to say "Allow 3 fds" while the call + // below passes 1 - confirm which value is intended. + r := sock.Reader(true /* blocking */) + r.EnableFDs(1) + + // The socket is closed right after sending the FD, so EOF is + // an allowed error. + b := [][]byte{{}} + if _, err := r.ReadVec(b); err != nil && err != io.EOF { + t.Fatalf("error reading from socket connection: %v", err) + } + + // We should have gotten a control message. + fds, err := r.ExtractFDs() + if err != nil { + t.Fatalf("error extracting fds from socket connection: %v", err) + } + if len(fds) != 1 { + t.Fatalf("got %d fds from socket, wanted 1", len(fds)) + } + + // Verify that the fd is a terminal. 
+ if _, err := unix.IoctlGetTermios(fds[0], unix.TCGETS); err != nil { + t.Errorf("fd is not a terminal (ioctl TCGETS got %v)", err) + } + + // Shut it down. + if err := s.Destroy(); err != nil { + t.Fatalf("error destroying sandbox: %v", err) + } + + // Close socket. + if err := srv.Close(); err != nil { + t.Fatalf("error closing socket: %v", err) + } +} + +// procListsEqual is used to check whether 2 Process lists are equal for all +// implemented fields. The lists passed in are not modified. +func procListsEqual(got, want []*control.Process) bool { + if len(got) != len(want) { + return false + } + for i := range got { + // Compare copies so the caller's Process values are left untouched. + pd1 := *got[i] + pd2 := *want[i] + // Zero out unimplemented and timing dependent fields. + pd1.Time, pd2.Time = "", "" + pd1.STime, pd2.STime = "", "" + pd1.C, pd2.C = 0, 0 + if pd1 != pd2 { + return false + } + } + return true +} + +// procListToString formats a Process list as "[p1,p2,...]", printing each +// entry with %+v, for use in test failure messages. +func procListToString(pl []*control.Process) string { + strs := make([]string, 0, len(pl)) + for _, p := range pl { + strs = append(strs, fmt.Sprintf("%+v", p)) + } + return fmt.Sprintf("[%s]", strings.Join(strs, ",")) +} + +// TestMain acts like runsc if it is called with the "boot" argument, otherwise +// it just runs the tests. This is required because creating a sandbox will +// call "/proc/self/exe boot". Normally /proc/self/exe is the runsc binary, +// but for tests we have to fake it. +func TestMain(m *testing.M) { + // exit terminates the process with the given status. + // NOTE(review): this was described as writing coverage data before + // exiting, but it only calls os.Exit - confirm. + exit := func(status int) { + os.Exit(status) + } + + if !flag.Parsed() { + flag.Parse() + } + + // If we are passed one of the commands then run it. + subcommands.Register(new(cmd.Boot), "boot") + subcommands.Register(new(cmd.Gofer), "gofer") + switch flag.Arg(0) { + case "boot", "gofer": + // Run the command in a goroutine so we can block the main + // thread waiting for shutdownSignal. 
+ go func() { + conf := &boot.Config{ + RootDir: "unused-root-dir", + Network: boot.NetworkNone, + } + // ws is passed by pointer to the subcommand; its exit status + // becomes this process's exit code below. + var ws syscall.WaitStatus + subcmdCode := subcommands.Execute(context.Background(), conf, &ws) + if subcmdCode != subcommands.ExitSuccess { + panic(fmt.Sprintf("command failed to execute, err: %v", subcmdCode)) + } + // Sandbox exited normally. Shut down this process. + os.Exit(ws.ExitStatus()) + }() + + // Shut down cleanly when the shutdownSignal is received. This + // allows us to write coverage data before exiting. + sigc := make(chan os.Signal, 1) + signal.Notify(sigc, shutdownSignal) + <-sigc + exit(0) + default: + // Otherwise run the tests. + exit(m.Run()) + } +} diff --git a/runsc/sandbox/status.go b/runsc/sandbox/status.go new file mode 100644 index 000000000..6fc936aba --- /dev/null +++ b/runsc/sandbox/status.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sandbox + +// Status enumerates sandbox statuses. The statuses and their semantics are +// part of the runtime CLI spec. +// +// TODO: Get precise about the transitions between statuses. +type Status int + +const ( + // Creating indicates "the container is being created". + Creating Status = iota + + // Created indicates "the runtime has finished the create operation and + // the container process has neither exited nor executed the + // user-specified program". 
+ Created + + // Running indicates "the container process has executed the + // user-specified program but has not exited". + Running + + // Stopped indicates "the container process has exited". + Stopped +) + +// String converts a Status to a string. These strings are part of the runtime +// CLI spec and should not be changed. +func (s Status) String() string { + switch s { + case Creating: + return "creating" + case Created: + return "created" + case Running: + return "running" + case Stopped: + return "stopped" + default: + // Any value that is not one of the declared statuses. + return "unknown" + } + +} diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD new file mode 100644 index 000000000..ae89260d2 --- /dev/null +++ b/runsc/specutils/BUILD @@ -0,0 +1,18 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "specutils", + srcs = ["specutils.go"], + importpath = "gvisor.googlesource.com/gvisor/runsc/specutils", + visibility = [ + "//runsc:__subpackages__", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/kernel/auth", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go new file mode 100644 index 000000000..bed0f75eb --- /dev/null +++ b/runsc/specutils/specutils.go @@ -0,0 +1,183 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package specutils contains utility functions for working with OCI runtime +// specs. +package specutils + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" +) + +// LogSpec logs the spec in a human-friendly way. +func LogSpec(spec *specs.Spec) { + log.Debugf("Spec: %+v", spec) + // The nested sections are logged one by one as well. + // NOTE(review): presumably because they are pointer fields that %+v + // on the parent prints only as addresses - confirm. + log.Debugf("Spec.Hooks: %+v", spec.Hooks) + log.Debugf("Spec.Linux: %+v", spec.Linux) + log.Debugf("Spec.Process: %+v", spec.Process) + log.Debugf("Spec.Root: %+v", spec.Root) +} + +// ReadSpec reads an OCI runtime spec from the given bundle directory. It +// returns an error if config.json cannot be read or is not valid JSON for a +// spec; in the latter case the error includes the raw file contents. +// +// TODO: This should validate the spec. +func ReadSpec(bundleDir string) (*specs.Spec, error) { + // The spec file must be in "config.json" inside the bundle directory. + specFile := filepath.Join(bundleDir, "config.json") + specBytes, err := ioutil.ReadFile(specFile) + if err != nil { + return nil, fmt.Errorf("error reading spec from file %q: %v", specFile, err) + } + var spec specs.Spec + if err := json.Unmarshal(specBytes, &spec); err != nil { + return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile, err, string(specBytes)) + } + return &spec, nil +} + +// GetExecutablePath returns the absolute path to the executable, relative to +// the root. It searches the environment PATH for the first file that exists +// with the given name. +func GetExecutablePath(exec, root string, env []string) (string, error) { + exec = filepath.Clean(exec) + + // Don't search PATH if exec is a path to a file (absolute or relative). + if strings.IndexByte(exec, '/') >= 0 { + return exec, nil + } + + // Get the PATH from the environment. 
+ const prefix = "PATH=" + var path []string + for _, e := range env { + if strings.HasPrefix(e, prefix) { + path = strings.Split(strings.TrimPrefix(e, prefix), ":") + break + } + } + + // Search the PATH for a file whose name matches the one we are looking + // for. + for _, p := range path { + abs := filepath.Join(root, p, exec) + // Skip directories: a directory that happens to share the + // executable's name must not shadow a real file later in PATH. + if fi, err := os.Stat(abs); err == nil && !fi.IsDir() { + // We found it! Return the path relative to the root. + return filepath.Join("/", p, exec), nil + } + } + + // Could not find a suitable path, just return the original string. + log.Warningf("could not find executable %s in path %s", exec, path) + return exec, nil +} + +// Capabilities takes in spec and returns a TaskCapabilities corresponding to +// the spec. +func Capabilities(specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { + var caps auth.TaskCapabilities + if specCaps != nil { + var err error + if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding); err != nil { + return nil, err + } + if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective); err != nil { + return nil, err + } + if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable); err != nil { + return nil, err + } + if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted); err != nil { + return nil, err + } + // TODO: Support ambient capabilities. 
+ } + return &caps, nil +} + +// capFromName maps the capability names used in OCI specs to the +// corresponding linux.Capability values. +var capFromName = map[string]linux.Capability{ + "CAP_CHOWN": linux.CAP_CHOWN, + "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, + "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, + "CAP_FOWNER": linux.CAP_FOWNER, + "CAP_FSETID": linux.CAP_FSETID, + "CAP_KILL": linux.CAP_KILL, + "CAP_SETGID": linux.CAP_SETGID, + "CAP_SETUID": linux.CAP_SETUID, + "CAP_SETPCAP": linux.CAP_SETPCAP, + "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, + "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, + // The capability is spelled CAP_NET_BROADCAST in capabilities(7) and + // in specs; only the Go constant carries the extra underscore. + "CAP_NET_BROADCAST": linux.CAP_NET_BROAD_CAST, + // Misspelled key kept so existing callers keep working. + "CAP_NET_BROAD_CAST": linux.CAP_NET_BROAD_CAST, + "CAP_NET_ADMIN": linux.CAP_NET_ADMIN, + "CAP_NET_RAW": linux.CAP_NET_RAW, + "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, + "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, + "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, + "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, + "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, + "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, + "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, + "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, + "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, + "CAP_SYS_NICE": linux.CAP_SYS_NICE, + "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, + "CAP_SYS_TIME": linux.CAP_SYS_TIME, + "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, + "CAP_MKNOD": linux.CAP_MKNOD, + "CAP_LEASE": linux.CAP_LEASE, + "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, + "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, + "CAP_SETFCAP": linux.CAP_SETFCAP, + "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, + "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, + "CAP_SYSLOG": linux.CAP_SYSLOG, + "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, + "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, +} + +// capsFromNames converts a list of capability names into a CapabilitySet. +// It returns an error naming the first entry that is not in capFromName. +func capsFromNames(names []string) (auth.CapabilitySet, error) { + var caps []linux.Capability + for _, n := range names { + c, ok := capFromName[n] + if !ok { + return 0, fmt.Errorf("unknown capability %q", n) + } + caps = append(caps, c) + } + return auth.CapabilitySetOfMany(caps), nil +} + +// Is9PMount returns true if the given mount can be mounted as an external gofer. 
+func Is9PMount(m specs.Mount) bool { + // Only /dev and paths below it are excluded. A plain string-prefix + // test would also reject unrelated destinations such as "/devices". + dst := filepath.Clean(m.Destination) + isDev := dst == "/dev" || strings.HasPrefix(dst, "/dev/") + return m.Type == "bind" && m.Source != "" && !isDev +} + +// BinPath returns the real path to self, resolving symbolic links. This is done +// to make the process name appear as 'runsc', instead of 'exe'. +func BinPath() (string, error) { + binPath, err := filepath.EvalSymlinks("/proc/self/exe") + if err != nil { + return "", fmt.Errorf(`error resolving "/proc/self/exe" symlink: %v`, err) + } + return binPath, nil +} |