diff options
Diffstat (limited to 'runsc')
40 files changed, 1770 insertions, 663 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 040f6a72d..2d9517f4a 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -26,10 +26,13 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/memutil", @@ -106,6 +109,7 @@ go_library( "//runsc/boot/pprof", "//runsc/config", "//runsc/specutils", + "//runsc/specutils/seccomp", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@org_golang_x_sys//unix:go_default_library", @@ -123,6 +127,7 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fd", "//pkg/fspath", "//pkg/log", "//pkg/p9", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 68a2b45cf..894651519 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -22,6 +22,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -257,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // All validation passed, logs the spec for debugging. specutils.LogSpec(args.Spec) - err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files) + fds, err := fd.NewFromFiles(args.FilePayload.Files) if err != nil { + return err + } + defer func() { + for _, fd := range fds { + _ = fd.Close() + } + }() + if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil { log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) return err } log.Debugf("Container %q started", args.CID) - return nil } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 149eb0b1b..6ac19668f 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -27,41 +27,30 @@ import ( // allowedSyscalls is the set of syscalls executed by the Sentry to the host OS. var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, syscall.SYS_DUP3: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, @@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FCHMOD: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, }, syscall.SYS_FSTAT: {}, @@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, }, // Non-private variants are included for flipcall support. They are otherwise // unncessary, as the sentry will use only private futexes internally. { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, { - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, }, }, syscall.SYS_GETPID: {}, unix.SYS_GETRANDOM: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_DOMAIN), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_DOMAIN), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, }, syscall.SYS_GETTID: {}, @@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{ // setting/getting termios and winsize. syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCGETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCGETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETS), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETS), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSF), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSF), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TCSETSW), - seccomp.AllowAny{}, /* termios struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TCSETSW), + seccomp.MatchAny{}, /* termios struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCSWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCSWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, { - seccomp.AllowAny{}, /* fd */ - seccomp.AllowValue(linux.TIOCGWINSZ), - seccomp.AllowAny{}, /* winsize struct */ + seccomp.MatchAny{}, /* fd */ + seccomp.EqualTo(linux.TIOCGWINSZ), + seccomp.MatchAny{}, /* winsize struct */ }, }, syscall.SYS_LSEEK: {}, @@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ), - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ), + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RECVMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(fdbased.MaxMsgsPerRecv), - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(fdbased.MaxMsgsPerRecv), + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, unix.SYS_SENDMMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT), - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT), + seccomp.EqualTo(0), }, }, syscall.SYS_RESTART_SYSCALL: {}, @@ -272,49 +261,49 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SCHED_YIELD: {}, syscall.SYS_SENDMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SETITIMER: {}, syscall.SYS_SHUTDOWN: []seccomp.Rule{ // Used by fs/host to shutdown host sockets. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)}, - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)}, // Used by unet to shutdown connections. - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TEE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(1), /* len */ - seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(1), /* len */ + seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */ }, }, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UTIMENSAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* null pathname */ - seccomp.AllowAny{}, - seccomp.AllowValue(0), /* flags */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* null pathname */ + seccomp.MatchAny{}, + seccomp.EqualTo(0), /* flags */ }, }, syscall.SYS_WRITE: {}, // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, seccomp.GreaterThan(0), }, }, @@ -325,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT4: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), }, }, syscall.SYS_BIND: {}, @@ -337,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_GETSOCKNAME: {}, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_ERROR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_ERROR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_KEEPALIVE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_KEEPALIVE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_TYPE), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_TYPE), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_LINGER), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_LINGER), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_INFO), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_INFO), }, }, syscall.SYS_IOCTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCOUTQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCOUTQ), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.TIOCINQ), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.TIOCINQ), }, }, syscall.SYS_LISTEN: {}, @@ -425,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules { syscall.SYS_SENDTO: {}, syscall.SYS_SETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_V6ONLY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_V6ONLY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_SNDBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_SNDBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_RCVBUF), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_RCVBUF), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_REUSEADDR), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_REUSEADDR), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_TCP), - seccomp.AllowValue(syscall.TCP_NODELAY), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_TCP), + seccomp.EqualTo(syscall.TCP_NODELAY), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_TOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_TOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IP), - seccomp.AllowValue(syscall.IP_RECVTOS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IP), + seccomp.EqualTo(syscall.IP_RECVTOS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_TCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_TCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_IPV6), - seccomp.AllowValue(syscall.IPV6_RECVTCLASS), - seccomp.AllowAny{}, - seccomp.AllowValue(4), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_IPV6), + seccomp.EqualTo(syscall.IPV6_RECVTCLASS), + seccomp.MatchAny{}, + seccomp.EqualTo(4), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RD), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_WR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_WR), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SHUT_RDWR), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SHUT_RDWR), }, }, syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_INET6), - seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_INET6), + seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_WRITEV: {}, @@ -532,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_ACCEPT: []seccomp.Rule{ { - seccomp.AllowValue(fd), + seccomp.EqualTo(fd), }, }, syscall.SYS_LISTEN: []seccomp.Rule{ { - seccomp.AllowValue(fd), - seccomp.AllowValue(16 /* unet.backlog */), + seccomp.EqualTo(fd), + seccomp.EqualTo(16 /* unet.backlog */), }, }, syscall.SYS_GETSOCKOPT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.SOL_SOCKET), - seccomp.AllowValue(syscall.SO_PEERCRED), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.SOL_SOCKET), + seccomp.EqualTo(syscall.SO_PEERCRED), }, }, } diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go index 5335ff82c..cea5613b8 100644 --- a/runsc/boot/filter/config_amd64.go +++ b/runsc/boot/filter/config_amd64.go @@ -24,8 +24,41 @@ import ( ) func init() { - allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL], - seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)}, - seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)}, - ) + allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + } } diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go index 7fa9bbda3..37313f97f 100644 --- a/runsc/boot/filter/config_arm64.go +++ b/runsc/boot/filter/config_arm64.go @@ -16,6 +16,29 @@ package filter -// Reserve for future customization. +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/seccomp" +) + func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } } diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go index 194952a7b..7b8669595 100644 --- a/runsc/boot/filter/config_profile.go +++ b/runsc/boot/filter/config_profile.go @@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules { return seccomp.SyscallRules{ syscall.SYS_OPENAT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC), }, }, } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 163265afe..ddf288456 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -34,6 +34,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" @@ -253,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem { // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. -func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to @@ -263,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } - overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } @@ -320,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error { } type fdDispenser struct { - fds []int + fds []*fd.FD } func (f *fdDispenser) remove() int { if f.empty() { panic("fdDispenser out of fds") } - rv := f.fds[0] + rv := f.fds[0].Release() f.fds = f.fds[1:] return rv } @@ -453,17 +454,17 @@ func (m *mountHint) isSupported() bool { func (m *mountHint) checkCompatible(mount specs.Mount) error { // Remove options that don't affect to mount's behavior. masterOpts := filterUnsupportedOptions(m.mount) - slaveOpts := filterUnsupportedOptions(mount) + replicaOpts := filterUnsupportedOptions(mount) - if len(masterOpts) != len(slaveOpts) { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if len(masterOpts) != len(replicaOpts) { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } sort.Strings(masterOpts) - sort.Strings(slaveOpts) + sort.Strings(replicaOpts) for i, opt := range masterOpts { - if opt != slaveOpts[i] { - return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts) + if opt != replicaOpts[i] { + return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } } return nil @@ -564,7 +565,7 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter { +func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter { return &containerMounter{ root: spec.Root, mounts: compileMounts(spec), @@ -740,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") - rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } @@ -850,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) - inode, err = addSubmountOverlay(ctx, inode, submounts) + inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index c3c754046..2e652ddad 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -27,8 +27,10 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" @@ -69,6 +71,7 @@ import ( "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" + "gvisor.dev/gvisor/runsc/specutils/seccomp" // Include supported socket providers. "gvisor.dev/gvisor/pkg/sentry/socket/hostinet" @@ -89,10 +92,10 @@ type containerInfo struct { procArgs kernel.CreateProcessArgs // stdioFDs contains stdin, stdout, and stderr. - stdioFDs []int + stdioFDs []*fd.FD // goferFDs are the FDs that attach the sandbox to the gofers. - goferFDs []int + goferFDs []*fd.FD } // Loader keeps state needed to start the kernel and run the container.. @@ -356,12 +359,17 @@ func New(args Args) (*Loader, error) { k.SetHostMount(hostMount) } + info := containerInfo{ + conf: args.Conf, + spec: args.Spec, + procArgs: procArgs, + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. - var stdioFDs []int newfd := startingStdioFD - for _, fd := range args.StdioFDs { + for _, stdioFD := range args.StdioFDs { // Check that newfd is unused to avoid clobbering over it. if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) { if err != nil { @@ -370,14 +378,17 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd) } - err := unix.Dup3(fd, newfd, unix.O_CLOEXEC) + err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC) if err != nil { - return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err) + return nil, fmt.Errorf("dup3 of stdios failed: %w", err) } - stdioFDs = append(stdioFDs, newfd) - _ = unix.Close(fd) + info.stdioFDs = append(info.stdioFDs, fd.New(newfd)) + _ = unix.Close(stdioFD) newfd++ } + for _, goferFD := range args.GoferFDs { + info.goferFDs = append(info.goferFDs, fd.New(goferFD)) + } eid := execID{cid: args.ID} l := &Loader{ @@ -386,13 +397,7 @@ func New(args Args) (*Loader, error) { sandboxID: args.ID, processes: map[execID]*execProcess{eid: {}}, mountHints: mountHints, - root: containerInfo{ - conf: args.Conf, - stdioFDs: stdioFDs, - goferFDs: args.GoferFDs, - spec: args.Spec, - procArgs: procArgs, - }, + root: info, } // We don't care about child signals; some platforms can generate a @@ -466,9 +471,14 @@ func (l *Loader) Destroy() { } l.watchdog.Stop() - for i, fd := range l.root.stdioFDs { - _ = unix.Close(fd) - l.root.stdioFDs[i] = -1 + // In the success case, stdioFDs and goferFDs will only contain + // released/closed FDs that ownership has been passed over to host FDs and + // gofer sessions. Close them here in case on failure. + for _, fd := range l.root.stdioFDs { + _ = fd.Close() + } + for _, fd := range l.root.goferFDs { + _ = fd.Close() } } @@ -499,6 +509,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return mf, nil } +// installSeccompFilters installs sandbox seccomp filters with the host. func (l *Loader) installSeccompFilters() error { if l.root.conf.DisableSeccomp { filter.Report("syscall filter is DISABLED. Running in less secure mode.") @@ -569,6 +580,7 @@ func (l *Loader) run() error { if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil { return err } + } ep.tg = l.k.GlobalInit() @@ -598,17 +610,6 @@ func (l *Loader) run() error { } }) - // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again - // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the - // passed FDs, so only close for VFS1. - if !kernel.VFS2Enabled { - for i, fd := range l.root.stdioFDs { - _ = unix.Close(fd) - l.root.stdioFDs[i] = -1 - } - } - log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() @@ -628,9 +629,9 @@ func (l *Loader) createContainer(cid string) error { } // startContainer starts a child container. It returns the thread group ID of -// the newly created process. Caller owns 'files' and may close them after -// this method returns. -func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*os.File) error { +// the newly created process. Used FDs are either closed or released. It's safe +// for the caller to close any remaining files upon return. +func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error { // Create capabilities. caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities) if err != nil { @@ -681,28 +682,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin } info := &containerInfo{ - conf: conf, - spec: spec, + conf: conf, + spec: spec, + stdioFDs: files[:3], + goferFDs: files[3:], } info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns) if err != nil { return fmt.Errorf("creating new process: %v", err) } - - // setupContainerFS() dups stdioFDs, so we don't need to dup them here. - for _, f := range files[:3] { - info.stdioFDs = append(info.stdioFDs, int(f.Fd())) - } - - // Can't take ownership away from os.File. dup them to get a new FDs. - for _, f := range files[3:] { - fd, err := unix.Dup(int(f.Fd())) - if err != nil { - return fmt.Errorf("failed to dup file: %v", err) - } - info.goferFDs = append(info.goferFDs, fd) - } - tg, err := l.createContainerProcess(false, cid, info, ep) if err != nil { return err @@ -780,19 +768,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn } } + // Install seccomp filters with the new task if there are any. + if info.conf.OCISeccomp { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp) + if err != nil { + return nil, fmt.Errorf("building seccomp program: %v", err) + } + + if log.IsLogging(log.Debug) { + out, _ := bpf.DecodeProgram(program) + log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out) + } + + task := tg.Leader() + // NOTE: It seems Flags are ignored by runc so we ignore them too. + if err := task.AppendSyscallFilter(program, true); err != nil { + return nil, fmt.Errorf("appending seccomp filters: %v", err) + } + } + } else { + if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil { + log.Warningf("Seccomp spec is being ignored") + } + } + return tg, nil } // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on // the gofer FDs looking for disconnects, and destroys the container if a // disconnect occurs in any of the gofer FDs. -func (l *Loader) startGoferMonitor(cid string, goferFDs []int) { +func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) { go func() { log.Debugf("Monitoring gofer health for container %q", cid) var events []unix.PollFd - for _, fd := range goferFDs { + for _, goferFD := range goferFDs { events = append(events, unix.PollFd{ - Fd: int32(fd), + Fd: int32(goferFD.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }) } @@ -1046,8 +1059,8 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st } func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol} + transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4} s := netstack.Stack{stack.New(stack.Options{ NetworkProtocols: netProtos, TransportProtocols: transProtos, @@ -1061,22 +1074,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in })} // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %s", err) + { + opt := tcpip.TCPSACKEnabled(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } // Set default TTLs as required by socket/netstack. - opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) - if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) - } - if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { - return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + { + opt := tcpip.DefaultTTLOption(netstack.DefaultTTL) + if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err) + } + if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err) + } } // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err) + { + opt := tcpip.TCPModerateReceiveBufferOption(true) + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err) + } } return &s, nil @@ -1272,7 +1293,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2 return ep.tty, ep.ttyVFS2, nil } -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { +func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { if len(stdioFDs) != 3 { return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 2343ce76c..bf9ec5d38 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,6 +26,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" @@ -444,7 +445,7 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{}) + mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -490,9 +491,9 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } ctx := l.k.SupervisorContext() - mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs) + mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs) if err != nil { - t.Fatalf("failed to setupVFS2: %v", err) + t.Fatalf("mountAll: %v", err) } root := mns.Root() @@ -702,7 +703,11 @@ func TestRestoreEnvironment(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { conf := testConfig() - mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{}) + var ioFDs []*fd.FD + for _, ioFD := range tc.ioFDs { + ioFDs = append(ioFDs, fd.New(ioFD)) + } + mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 66b6cf19b..e36664938 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -21,6 +21,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" @@ -134,7 +135,7 @@ func registerFilesystems(k *kernel.Kernel) error { } func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - mns, err := mntr.setupVFS2(ctx, conf, procArgs) + mns, err := mntr.mountAll(conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) } @@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containe return nil } -func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { +func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { log.Infof("Configuring container's file system with VFS2") // Create context with root credentials to mount the filesystem (the current @@ -168,34 +169,108 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, p } rootProcArgs.MountNamespaceVFS2 = mns + root := mns.Root() + defer root.DecRef(rootCtx) + if root.Mount().ReadOnly() { + // Switch to ReadWrite while we setup submounts. + if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { + return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) + } + // Restore back to ReadOnly at the end. + defer func() { + if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { + panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) + } + }() + } + // Mount submounts. if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil { return nil, fmt.Errorf("mounting submounts vfs2: %w", err) } + return mns, nil } +// createMountNamespaceVFS2 creates the container's root mount and namespace. func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { fd := c.fds.remove() - opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */) + data := p9MountData(fd, conf.FileAccess, true /* vfs2 */) if conf.OverlayfsStaleRead { // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing // Options field). So assume root is always on top of overlayfs. - opts = append(opts, "overlayfs_stale_read") + data = append(data, "overlayfs_stale_read") } log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{ - Data: strings.Join(opts, ","), - }) + opts := &vfs.MountOptions{ + ReadOnly: c.root.Readonly, + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(data, ","), + }, + InternalMount: true, + } + + fsName := gofer.Name + if conf.Overlay && !c.root.Readonly { + log.Infof("Adding overlay on top of root") + var err error + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting root with overlay: %w", err) + } + defer cleanup() + fsName = overlay.Name + } + + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts) if err != nil { return nil, fmt.Errorf("setting up mount namespace: %w", err) } return mns, nil } +// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper +// layer using tmpfs, and return overlay mount options. "cleanup" must be called +// after the options have been used to mount the overlay, to release refs on +// lower and upper mounts. +func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) { + // First copy options from lower layer to upper layer and overlay. Clear + // filesystem specific options. + upperOpts := *lowerOpts + upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + overlayOpts := *lowerOpts + overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{} + + // Next mount upper and lower. Upper is a tmpfs mount to keep all + // modifications inside the sandbox. + upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) + } + cu := cleanup.Make(func() { upper.DecRef(ctx) }) + defer cu.Clean() + + // All writes go to the upper layer, be paranoid and make lower readonly. + lowerOpts.ReadOnly = true + lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) + if err != nil { + return nil, nil, err + } + cu.Add(func() { lower.DecRef(ctx) }) + + // Configure overlay with both layers. + overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ + UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()), + LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())}, + } + return &overlayOpts, cu.Release(), nil +} + func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { mounts, err := c.prepareMountsVFS2() if err != nil { @@ -225,8 +300,9 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config. if mnt != nil && mnt.ReadOnly() { // Switch to ReadWrite while we setup submounts. if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { - return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err) + return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err) } + // Restore back to ReadOnly at the end. defer func() { if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err)) @@ -276,14 +352,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { } func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) { - root := mns.Root() - defer root.DecRef(ctx) - target := &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(submount.Destination), - } - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount) + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount) if err != nil { return nil, fmt.Errorf("mountOptions failed: %w", err) } @@ -292,8 +361,27 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C return nil, nil } - if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil { - return nil, err + if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err) + } + + if useOverlay { + log.Infof("Adding overlay on top of mount %q", submount.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), } mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) if err != nil { @@ -305,8 +393,9 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. -func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) { +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) { fsName := m.Type + useOverlay := false var data []string // Find filesystem name and FS specific data field. @@ -321,7 +410,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { - return "", nil, err + return "", nil, false, err } case bind: @@ -329,13 +418,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo if m.fd == 0 { // Check that an FD was provided to fails fast. Technically FD=0 is valid, // but unlikely to be correct in this context. - return "", nil, fmt.Errorf("9P mount requires a connection FD") + return "", nil, false, fmt.Errorf("9P mount requires a connection FD") } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + default: log.Warningf("ignoring unknown filesystem type %q", m.Type) - return "", nil, nil + return "", nil, false, nil } opts := &vfs.MountOptions{ @@ -360,11 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo } } - if conf.Overlay { - // All writes go to upper, be paranoid and make lower readonly. - opts.ReadOnly = true - } - return fsName, opts, nil + return fsName, opts, useOverlay, nil } // mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so. @@ -467,13 +555,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. mntFD := &mountAndFD{Mount: hint.mount} - fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD) if err != nil { return nil, err } if len(fsName) == 0 { return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) } + + if useOverlay { + log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination) + var cleanup func() + opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName) + if err != nil { + return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err) + } + defer cleanup() + fsName = overlay.Name + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) } @@ -484,7 +584,9 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co return nil, err } - _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + // Ignore data and useOverlay because these were already applied to + // the master mount. + _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) if err != nil { return nil, err } @@ -496,18 +598,39 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co root := mns.Root() defer root.DecRef(ctx) - if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil { - return nil, err - } - target := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(mount.Destination), } + + if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil { + return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err) + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { return nil, err } log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) return newMnt, nil } + +func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { + root := mns.Root() + defer root.DecRef(ctx) + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dest), + } + // First check if mount point exists. When overlay is enabled, gofer doesn't + // allow changes to the FS, making MakeSytheticMountpoint() ineffective + // because MkdirAt fails with EROFS even if file exists. + vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) + if err == nil { + // File exists, we're done. + vd.DecRef(ctx) + return nil + } + return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) +} diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 357f46517..cd419e1aa 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -168,7 +168,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Get the spec from the specFD. specFile := os.NewFile(uintptr(b.specFD), "spec file") defer specFile.Close() - spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile) + spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf) if err != nil { Fatalf("reading spec: %v", err) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index db46d509f..8fe0c427a 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -118,7 +118,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa Fatalf("setting bundleDir") } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { Fatalf("reading spec: %v", err) } diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index 4d9085244..e76f7ba1d 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -91,7 +91,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{} if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go index 600876a27..775ed4b43 100644 --- a/runsc/cmd/exec.go +++ b/runsc/cmd/exec.go @@ -220,7 +220,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi cmd.Stderr = os.Stderr // If the console control socket file is provided, then create a new - // pty master/slave pair and set the TTY on the sandbox process. + // pty master/replica pair and set the TTY on the sandbox process. if ex.consoleSocket != "" { // Create a new TTY pair and send the master on the provided socket. tty, err := console.NewWithSocket(ex.consoleSocket) @@ -229,7 +229,7 @@ func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.Exi } defer tty.Close() - // Set stdio to the new TTY slave. + // Set stdio to the new TTY replica. cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 7da02c3af..371fcc0ae 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -62,9 +62,8 @@ type Gofer struct { applyCaps bool setUpRoot bool - panicOnWrite bool - specFD int - mountsFD int + specFD int + mountsFD int } // Name implements subcommands.Command. @@ -87,7 +86,6 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) { f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory") f.Var(&g.ioFDs, "io-fds", "list of FDs to connect 9P servers. They must follow this order: root first, then mounts as defined in the spec") f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do") - f.BoolVar(&g.panicOnWrite, "panic-on-write", false, "if true, panics on attempts to write to RO mounts. RW mounts are unnaffected") f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process") f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec") f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).") @@ -100,15 +98,15 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) return subcommands.ExitUsageError } + conf := args[0].(*config.Config) + specFile := os.NewFile(uintptr(g.specFD), "spec file") defer specFile.Close() - spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile) + spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf) if err != nil { Fatalf("reading spec: %v", err) } - conf := args[0].(*config.Config) - if g.setUpRoot { if err := setupRootFS(spec, conf); err != nil { Fatalf("Error setting up root FS: %v", err) @@ -168,8 +166,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) // Start with root mount, then add any other additional mount as needed. ats := make([]p9.Attacher, 0, len(spec.Mounts)+1) ap, err := fsgofer.NewAttachPoint("/", fsgofer.Config{ - ROMount: spec.Root.Readonly || conf.Overlay, - PanicOnWrite: g.panicOnWrite, + ROMount: spec.Root.Readonly || conf.Overlay, }) if err != nil { Fatalf("creating attach point: %v", err) @@ -181,9 +178,8 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) for _, m := range spec.Mounts { if specutils.Is9PMount(m) { cfg := fsgofer.Config{ - ROMount: isReadonlyMount(m.Options) || conf.Overlay, - PanicOnWrite: g.panicOnWrite, - HostUDS: conf.FSGoferHostUDS, + ROMount: isReadonlyMount(m.Options) || conf.Overlay, + HostUDS: conf.FSGoferHostUDS, } ap, err := fsgofer.NewAttachPoint(m.Destination, cfg) if err != nil { @@ -316,6 +312,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { if err != nil { return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err) } + log.Infof("Create working directory %q if needed", spec.Process.Cwd) if err := os.MkdirAll(dst, 0755); err != nil { return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err) } diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go index b16975804..096ec814c 100644 --- a/runsc/cmd/restore.go +++ b/runsc/cmd/restore.go @@ -88,7 +88,7 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{ if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go index 1161de67a..c48cbe4cd 100644 --- a/runsc/cmd/run.go +++ b/runsc/cmd/run.go @@ -75,7 +75,7 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s if bundleDir == "" { bundleDir = getwdOrDie() } - spec, err := specutils.ReadSpec(bundleDir) + spec, err := specutils.ReadSpec(bundleDir, conf) if err != nil { return Errorf("reading spec: %v", err) } diff --git a/runsc/config/config.go b/runsc/config/config.go index bca27ebf1..f30f79f68 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -157,6 +157,12 @@ type Config struct { // Enables FUSE usage. FUSE bool `flag:"fuse"` + // Allows overriding of flags in OCI annotations. + AllowFlagOverride bool `flag:"allow-flag-override"` + + // Enables seccomp inside the sandbox. + OCISeccomp bool `flag:"oci-seccomp"` + // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in // tests. It allows runsc to start the sandbox process as the current // user, and without chrooting the sandbox process. This can be diff --git a/runsc/config/config_test.go b/runsc/config/config_test.go index af7867a2a..fb162b7eb 100644 --- a/runsc/config/config_test.go +++ b/runsc/config/config_test.go @@ -183,3 +183,90 @@ func TestValidationFail(t *testing.T) { }) } } + +func TestOverride(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + c.AllowFlagOverride = true + + t.Run("string", func(t *testing.T) { + c.RootDir = "foobar" + if err := c.Override("root", "bar"); err != nil { + t.Fatalf("Override(root, bar) failed: %v", err) + } + defer setDefault("root") + if c.RootDir != "bar" { + t.Errorf("Override(root, bar) didn't work: %+v", c) + } + }) + + t.Run("bool", func(t *testing.T) { + c.Debug = true + if err := c.Override("debug", "false"); err != nil { + t.Fatalf("Override(debug, false) failed: %v", err) + } + defer setDefault("debug") + if c.Debug { + t.Errorf("Override(debug, false) didn't work: %+v", c) + } + }) + + t.Run("enum", func(t *testing.T) { + c.FileAccess = FileAccessShared + if err := c.Override("file-access", "exclusive"); err != nil { + t.Fatalf("Override(file-access, exclusive) failed: %v", err) + } + defer setDefault("file-access") + if c.FileAccess != FileAccessExclusive { + t.Errorf("Override(file-access, exclusive) didn't work: %+v", c) + } + }) +} + +func TestOverrideDisabled(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + const errMsg = "flag override disabled" + if err := c.Override("root", "path"); err == nil || !strings.Contains(err.Error(), errMsg) { + t.Errorf("Override() wrong error: %v", err) + } +} + +func TestOverrideError(t *testing.T) { + c, err := NewFromFlags() + if err != nil { + t.Fatal(err) + } + c.AllowFlagOverride = true + for _, tc := range []struct { + name string + value string + error string + }{ + { + name: "invalid", + value: "valid", + error: `flag "invalid" not found`, + }, + { + name: "debug", + value: "invalid", + error: "error setting flag debug", + }, + { + name: "file-access", + value: "invalid", + error: "invalid file access type", + }, + } { + t.Run(tc.name, func(t *testing.T) { + if err := c.Override(tc.name, tc.value); err == nil || !strings.Contains(err.Error(), tc.error) { + t.Errorf("Override(%q, %q) wrong error: %v", tc.name, tc.value, err) + } + }) + } +} diff --git a/runsc/config/flags.go b/runsc/config/flags.go index 488a4b9fb..a5f25cfa2 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -48,6 +48,7 @@ func RegisterFlags() { flag.Bool("log-packets", false, "enable network packet logging.") flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s.") flag.Bool("alsologtostderr", false, "send log messages to stderr.") + flag.Bool("allow-flag-override", false, "allow OCI annotations (dev.gvisor.flag.<name>) to override flags for debugging.") // Debugging flags: strace related flag.Bool("strace", false, "enable strace.") @@ -62,6 +63,7 @@ func RegisterFlags() { flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.") flag.Var(leakModePtr(refs.NoLeakChecking), "ref-leak-mode", "sets reference leak check mode: disabled (default), log-names, log-traces.") flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") + flag.Bool("oci-seccomp", false, "Enables loading OCI seccomp filters inside the sandbox.") // Flags that control sandbox runtime behavior: FS related. flag.Var(fileAccessTypePtr(FileAccessExclusive), "file-access", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") @@ -149,6 +151,41 @@ func (c *Config) ToFlags() []string { return rv } +// Override writes a new value to a flag. +func (c *Config) Override(name string, value string) error { + if !c.AllowFlagOverride { + return fmt.Errorf("flag override disabled, use --allow-flag-override to enable it") + } + + obj := reflect.ValueOf(c).Elem() + st := obj.Type() + for i := 0; i < st.NumField(); i++ { + f := st.Field(i) + fieldName, ok := f.Tag.Lookup("flag") + if !ok || fieldName != name { + // Not a flag field, or flag name doesn't match. + continue + } + fl := flag.CommandLine.Lookup(name) + if fl == nil { + // Flag must exist if there is a field match above. + panic(fmt.Sprintf("Flag %q not found", name)) + } + + // Use flag to convert the string value to the underlying flag type, using + // the same rules as the command-line for consistency. + if err := fl.Value.Set(value); err != nil { + return fmt.Errorf("error setting flag %s=%q: %w", name, value, err) + } + x := reflect.ValueOf(flag.Get(fl.Value)) + obj.Field(i).Set(x) + + // Validates the config again to ensure it's left in a consistent state. + return c.validate() + } + return fmt.Errorf("flag %q not found. Cannot set it to %q", name, value) +} + func getVal(field reflect.Value) string { if str, ok := field.Addr().Interface().(fmt.Stringer); ok { return str.String() diff --git a/runsc/console/console.go b/runsc/console/console.go index 64b23639a..dbb88e117 100644 --- a/runsc/console/console.go +++ b/runsc/console/console.go @@ -24,11 +24,11 @@ import ( "golang.org/x/sys/unix" ) -// NewWithSocket creates pty master/slave pair, sends the master FD over the given -// socket, and returns the slave. +// NewWithSocket creates pty master/replica pair, sends the master FD over the given +// socket, and returns the replica. func NewWithSocket(socketPath string) (*os.File, error) { - // Create a new pty master and slave. - ptyMaster, ptySlave, err := pty.Open() + // Create a new pty master and replica. + ptyMaster, ptyReplica, err := pty.Open() if err != nil { return nil, fmt.Errorf("opening pty: %v", err) } @@ -37,18 +37,18 @@ func NewWithSocket(socketPath string) (*os.File, error) { // Get a connection to the socket path. conn, err := net.Dial("unix", socketPath) if err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("dialing socket %q: %v", socketPath, err) } defer conn.Close() uc, ok := conn.(*net.UnixConn) if !ok { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("connection is not a UnixConn: %T", conn) } socket, err := uc.File() if err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("getting file for unix socket %v: %v", uc, err) } defer socket.Close() @@ -56,8 +56,8 @@ func NewWithSocket(socketPath string) (*os.File, error) { // Send the master FD over the connection. msg := unix.UnixRights(int(ptyMaster.Fd())) if err := unix.Sendmsg(int(socket.Fd()), []byte("pty-master"), msg, nil, 0); err != nil { - ptySlave.Close() + ptyReplica.Close() return nil, fmt.Errorf("sending console over unix socket %q: %v", socketPath, err) } - return ptySlave, nil + return ptyReplica, nil } diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 995d4e267..4228399b8 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -185,14 +185,14 @@ func TestJobControlSignalExec(t *testing.T) { t.Fatalf("error starting container: %v", err) } - // Create a pty master/slave. The slave will be passed to the exec + // Create a pty master/replica. The replica will be passed to the exec // process. - ptyMaster, ptySlave, err := pty.Open() + ptyMaster, ptyReplica, err := pty.Open() if err != nil { t.Fatalf("error opening pty: %v", err) } defer ptyMaster.Close() - defer ptySlave.Close() + defer ptyReplica.Close() // Exec bash and attach a terminal. Note that occasionally /bin/sh // may be a different shell or have a different configuration (such @@ -203,9 +203,9 @@ func TestJobControlSignalExec(t *testing.T) { // Don't let bash execute from profile or rc files, otherwise // our PID counts get messed up. Argv: []string{"/bin/bash", "--noprofile", "--norc"}, - // Pass the pty slave as FD 0, 1, and 2. + // Pass the pty replica as FD 0, 1, and 2. FilePayload: urpc.FilePayload{ - Files: []*os.File{ptySlave, ptySlave, ptySlave}, + Files: []*os.File{ptyReplica, ptyReplica, ptyReplica}, }, StdioIsPty: true, } diff --git a/runsc/container/container.go b/runsc/container/container.go index 6e1d6a568..63478ba8c 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -902,9 +902,6 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu } args = append(args, "gofer", "--bundle", bundleDir) - if conf.Overlay { - args = append(args, "--panic-on-write=true") - } // Open the spec file to donate to the sandbox. specFile, err := specutils.OpenSpec(bundleDir) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 6082068c7..548c68087 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -41,6 +41,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" @@ -292,22 +293,20 @@ var ( func configs(t *testing.T, opts ...configOption) map[string]*config.Config { // Always load the default config. cs := make(map[string]*config.Config) + testutil.TestConfig(t) for _, o := range opts { + c := testutil.TestConfig(t) switch o { case overlay: - c := testutil.TestConfig(t) c.Overlay = true cs["overlay"] = c case ptrace: - c := testutil.TestConfig(t) c.Platform = platforms.Ptrace cs["ptrace"] = c case kvm: - c := testutil.TestConfig(t) c.Platform = platforms.KVM cs["kvm"] = c case nonExclusiveFS: - c := testutil.TestConfig(t) c.FileAccess = config.FileAccessShared cs["non-exclusive"] = c default: @@ -318,22 +317,12 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config { } func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config { - vfs1 := configs(t, opts...) - - var optsVFS2 []configOption - for _, opt := range opts { - // TODO(gvisor.dev/issue/1487): Enable overlay tests. - if opt != overlay { - optsVFS2 = append(optsVFS2, opt) - } - } - - for key, value := range configs(t, optsVFS2...) { + all := configs(t, opts...) + for key, value := range configs(t, opts...) { value.VFS2 = true - vfs1[key+"VFS2"] = value + all[key+"VFS2"] = value } - - return vfs1 + return all } // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. @@ -512,7 +501,7 @@ func TestExePath(t *testing.T) { t.Fatalf("error making directory: %v", err) } - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { for _, test := range []struct { path string @@ -837,7 +826,7 @@ func TestExecProcList(t *testing.T) { // TestKillPid verifies that we can signal individual exec'd processes. func TestKillPid(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { @@ -1470,7 +1459,7 @@ func TestRunNonRoot(t *testing.T) { // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { root, err := ioutil.TempDir(testutil.TmpDir(), "root") if err != nil { @@ -1490,6 +1479,8 @@ func TestMountNewDir(t *testing.T) { Source: srcDir, Type: "bind", }) + // Extra points for creating the mount with a readonly root. + spec.Root.Readonly = true if err := run(spec, conf); err != nil { t.Fatalf("error running sandbox: %v", err) @@ -1499,17 +1490,17 @@ func TestMountNewDir(t *testing.T) { } func TestReadonlyRoot(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec := testutil.NewSpecWithArgs("sleep", "100") spec.Root.Readonly = true + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } defer cleanup() - // Create, start and wait for the container. args := Args{ ID: testutil.RandomContainerID(), Spec: spec, @@ -1524,12 +1515,82 @@ func TestReadonlyRoot(t *testing.T) { t.Fatalf("error starting container: %v", err) } - ws, err := c.Wait() + // Read mounts to check that root is readonly. + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", "mount | grep ' / '") + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("root mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("root not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", "/foo") if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("touch file in ro mount: %v", err) } if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) + t.Fatalf("wrong waitStatus: %v", ws) + } + }) + } +} + +func TestReadonlyMount(t *testing.T) { + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec := testutil.NewSpecWithArgs("sleep", "100") + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() + + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Read mounts to check that volume is readonly. + cmd := fmt.Sprintf("mount | grep ' %s '", dir) + out, ws, err := executeCombinedOutput(c, "/bin/sh", "-c", cmd) + if err != nil || ws != 0 { + t.Fatalf("exec failed, ws: %v, err: %v", ws, err) + } + t.Logf("mount: %q", out) + if !strings.Contains(string(out), "(ro)") { + t.Errorf("volume not mounted readonly: %q", out) + } + + // Check that file cannot be created. + ws, err = execute(c, "/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("touch file in ro mount: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("wrong WaitStatus: %v", ws) } }) } @@ -1616,54 +1677,6 @@ func TestUIDMap(t *testing.T) { } } -func TestReadonlyMount(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { - t.Run(name, func(t *testing.T) { - dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") - spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false - - _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer cleanup() - - // Create, start and wait for the container. - args := Args{ - ID: testutil.RandomContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } - - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) - } - }) - } -} - // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { @@ -1830,8 +1843,9 @@ func TestUserLog(t *testing.T) { t.Fatal("error finding test_app:", err) } - // sched_rr_get_interval = 148 - not implemented in gvisor. - spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148") + // sched_rr_get_interval - not implemented in gvisor. + num := strconv.Itoa(syscall.SYS_SCHED_RR_GET_INTERVAL) + spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall="+num) conf := testutil.TestConfig(t) _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { @@ -2013,7 +2027,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) { } func TestCreateWorkingDir(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") if err != nil { @@ -2116,27 +2130,19 @@ func TestMountPropagation(t *testing.T) { // Check that mount didn't propagate to private mount. privFile := filepath.Join(priv, "mnt", "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "!", "-f", privFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "!", "-f", privFile); err != nil || ws != 0 { t.Fatalf("exec: test ! -f %q, ws: %v, err: %v", privFile, ws, err) } // Check that mount propagated to slave mount. slaveFile := filepath.Join(slave, "mnt", "file") - execArgs = &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", slaveFile}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", slaveFile); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", privFile, ws, err) } } func TestMountSymlink(t *testing.T) { - for name, conf := range configsWithVFS2(t, overlay) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") if err != nil { @@ -2196,11 +2202,7 @@ func TestMountSymlink(t *testing.T) { // Check that symlink was resolved and mount was created where the symlink // is pointing to. file := path.Join(target, "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", file}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + if ws, err := execute(cont, "/usr/bin/test", "-f", file); err != nil || ws != 0 { t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) } }) @@ -2326,6 +2328,35 @@ func TestTTYField(t *testing.T) { } } +func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) { + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + } + return cont.executeSync(args) +} + +func executeCombinedOutput(cont *Container, name string, arg ...string) ([]byte, syscall.WaitStatus, error) { + r, w, err := os.Pipe() + if err != nil { + return nil, 0, err + } + defer r.Close() + + args := &control.ExecArgs{ + Filename: name, + Argv: append([]string{name}, arg...), + FilePayload: urpc.FilePayload{Files: []*os.File{os.Stdin, w, w}}, + } + ws, err := cont.executeSync(args) + w.Close() + if err != nil { + return nil, 0, err + } + out, err := ioutil.ReadAll(r) + return out, ws, err +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 1beea123f..952215ec1 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -169,7 +169,7 @@ func TestMultiContainerSanity(t *testing.T) { // TestMultiPIDNS checks that it is possible to run 2 dead-simple // containers in the same sandbox with different pidns. func TestMultiPIDNS(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -214,7 +214,7 @@ func TestMultiPIDNS(t *testing.T) { // TestMultiPIDNSPath checks the pidns path. func TestMultiPIDNSPath(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -580,7 +580,7 @@ func TestMultiContainerDestroy(t *testing.T) { t.Fatal("error finding test_app:", err) } - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1252,8 +1252,7 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { - //TODO(gvisor.dev/issue/1487): This is failing with VFS2. - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1360,7 +1359,7 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { } // Test that unsupported pod mounts options are ignored when matching master and -// slave mounts. +// replica mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { @@ -1518,8 +1517,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { } // Check that container isn't running anymore. - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } @@ -1534,8 +1532,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { if err := waitForProcessList(c, pl); err != nil { t.Errorf("Container %q was affected by another container: %v", c.ID, err) } - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err != nil { + if _, err := execute(c, "/bin/true"); err != nil { t.Fatalf("Container %q was affected by another container: %v", c.ID, err) } } @@ -1557,8 +1554,7 @@ func TestMultiContainerGoferKilled(t *testing.T) { // Check that entire sandbox isn't running anymore. for _, c := range containers { - args := &control.ExecArgs{Argv: []string{"/bin/true"}} - if _, err := c.executeSync(args); err == nil { + if _, err := execute(c, "/bin/true"); err == nil { t.Fatalf("Container %q was not stopped after gofer death", c.ID) } } @@ -1720,12 +1716,11 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { homeDirs[name] = homeFile } - // We will sleep in the root container in order to ensure that - // the root container doesn't terminate before sub containers can be - // created. + // We will sleep in the root container in order to ensure that the root + //container doesn't terminate before sub containers can be created. rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())} subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())} - execCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())} + execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name()) // Setup the containers, a root container and sub container. specConfig, ids := createSpecs(rootCmd, subCmd) @@ -1736,9 +1731,8 @@ func TestMultiContainerHomeEnvDir(t *testing.T) { defer cleanup() // Exec into the root container synchronously. - args := &control.ExecArgs{Argv: execCmd} - if _, err := containers[0].executeSync(args); err != nil { - t.Errorf("error executing %+v: %v", args, err) + if _, err := execute(containers[0], "/bin/sh", "-c", execCmd); err != nil { + t.Errorf("error executing %+v: %v", execCmd, err) } // Wait for the subcontainer to finish. diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index 4ea8fefee..cb5bffb89 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -168,11 +168,7 @@ func TestSharedVolume(t *testing.T) { func checkFile(c *Container, filename string, want []byte) error { cpy := filename + ".copy" - argsCp := &control.ExecArgs{ - Filename: "/bin/cp", - Argv: []string{"cp", "-f", filename, cpy}, - } - if _, err := c.executeSync(argsCp); err != nil { + if _, err := execute(c, "/bin/cp", "-f", filename, cpy); err != nil { return fmt.Errorf("unexpected error copying file %q to %q: %v", filename, cpy, err) } got, err := ioutil.ReadFile(cpy) @@ -235,11 +231,7 @@ func TestSharedVolumeFile(t *testing.T) { } // Append to file inside the container and check that content is not lost. - argsAppend := &control.ExecArgs{ - Filename: "/bin/bash", - Argv: []string{"bash", "-c", "echo -n sandbox- >> " + filename}, - } - if _, err := c.executeSync(argsAppend); err != nil { + if _, err := execute(c, "/bin/bash", "-c", "echo -n sandbox- >> "+filename); err != nil { t.Fatalf("unexpected error appending file %q: %v", filename, err) } want = []byte("host-sandbox-") diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 88814b83c..39b8a0b1e 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -27,62 +27,51 @@ import ( var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_ACCEPT: {}, syscall.SYS_CLOCK_GETTIME: {}, - syscall.SYS_CLONE: []seccomp.Rule{ - { - seccomp.AllowValue( - syscall.CLONE_VM | - syscall.CLONE_FS | - syscall.CLONE_FILES | - syscall.CLONE_SIGHAND | - syscall.CLONE_SYSVSEM | - syscall.CLONE_THREAD), - }, - }, - syscall.SYS_CLOSE: {}, - syscall.SYS_DUP: {}, - syscall.SYS_EPOLL_CTL: {}, + syscall.SYS_CLOSE: {}, + syscall.SYS_DUP: {}, + syscall.SYS_EPOLL_CTL: {}, syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_EVENTFD2: []seccomp.Rule{ { - seccomp.AllowValue(0), - seccomp.AllowValue(0), + seccomp.EqualTo(0), + seccomp.EqualTo(0), }, }, syscall.SYS_EXIT: {}, syscall.SYS_EXIT_GROUP: {}, syscall.SYS_FALLOCATE: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, }, syscall.SYS_FCHMOD: {}, syscall.SYS_FCHOWNAT: {}, syscall.SYS_FCNTL: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_SETFL), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_SETFL), }, { - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.F_GETFD), + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.F_GETFD), }, // Used by flipcall.PacketWindowAllocator.Init(). { - seccomp.AllowAny{}, - seccomp.AllowValue(unix.F_ADD_SEALS), + seccomp.MatchAny{}, + seccomp.EqualTo(unix.F_ADD_SEALS), }, }, syscall.SYS_FSTAT: {}, @@ -91,31 +80,31 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_FTRUNCATE: {}, syscall.SYS_FUTEX: { seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, // Non-private futex used for flipcall. seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAIT), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAIT), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, seccomp.Rule{ - seccomp.AllowAny{}, - seccomp.AllowValue(linux.FUTEX_WAKE), - seccomp.AllowAny{}, - seccomp.AllowAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(linux.FUTEX_WAKE), + seccomp.MatchAny{}, + seccomp.MatchAny{}, }, }, syscall.SYS_GETDENTS64: {}, @@ -137,28 +126,28 @@ var allowedSyscalls = seccomp.SyscallRules{ // TODO(b/148688965): Remove once this is gone from Go. syscall.SYS_MLOCK: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowValue(4096), + seccomp.MatchAny{}, + seccomp.EqualTo(4096), }, }, syscall.SYS_MMAP: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_SHARED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_SHARED), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED), }, }, syscall.SYS_MPROTECT: {}, @@ -172,14 +161,14 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_READLINKAT: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC), }, { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK), }, }, syscall.SYS_RENAMEAT: {}, @@ -190,33 +179,33 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SENDMSG: []seccomp.Rule{ // Used by fdchannel.Endpoint.SendFD(). { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(0), }, // Used by unet.SocketWriter.WriteVec(). { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), + seccomp.MatchAny{}, + seccomp.MatchAny{}, + seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL), }, }, syscall.SYS_SHUTDOWN: []seccomp.Rule{ - {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, + {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, // Used by fdchannel.NewConnectedSockets(). syscall.SYS_SOCKETPAIR: { { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_SEQPACKET | syscall.SOCK_CLOEXEC), + seccomp.EqualTo(0), }, }, syscall.SYS_SYMLINKAT: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { - seccomp.AllowValue(uint64(os.Getpid())), + seccomp.EqualTo(uint64(os.Getpid())), }, }, syscall.SYS_UNLINKAT: {}, @@ -227,24 +216,24 @@ var allowedSyscalls = seccomp.SyscallRules{ var udsSyscalls = seccomp.SyscallRules{ syscall.SYS_SOCKET: []seccomp.Rule{ { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_STREAM), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_STREAM), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_DGRAM), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_DGRAM), + seccomp.EqualTo(0), }, { - seccomp.AllowValue(syscall.AF_UNIX), - seccomp.AllowValue(syscall.SOCK_SEQPACKET), - seccomp.AllowValue(0), + seccomp.EqualTo(syscall.AF_UNIX), + seccomp.EqualTo(syscall.SOCK_SEQPACKET), + seccomp.EqualTo(0), }, }, syscall.SYS_CONNECT: []seccomp.Rule{ { - seccomp.AllowAny{}, + seccomp.MatchAny{}, }, }, } diff --git a/runsc/fsgofer/filter/config_amd64.go b/runsc/fsgofer/filter/config_amd64.go index a4b28cb8b..686753d96 100644 --- a/runsc/fsgofer/filter/config_amd64.go +++ b/runsc/fsgofer/filter/config_amd64.go @@ -25,8 +25,41 @@ import ( func init() { allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{ - {seccomp.AllowValue(linux.ARCH_GET_FS)}, - {seccomp.AllowValue(linux.ARCH_SET_FS)}, + // TODO(b/168828518): No longer used in Go 1.16+. + {seccomp.EqualTo(linux.ARCH_SET_FS)}, + } + + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SETTLS | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, + { + // TODO(b/168828518): No longer used in Go 1.16+ (on amd64). + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + seccomp.EqualTo(0), // parent_tidptr + seccomp.EqualTo(0), // child_tidptr + seccomp.MatchAny{}, // tls + }, } allowedSyscalls[syscall.SYS_NEWFSTATAT] = []seccomp.Rule{} diff --git a/runsc/fsgofer/filter/config_arm64.go b/runsc/fsgofer/filter/config_arm64.go index d2697deb7..ff0cf77a0 100644 --- a/runsc/fsgofer/filter/config_arm64.go +++ b/runsc/fsgofer/filter/config_arm64.go @@ -23,5 +23,26 @@ import ( ) func init() { + allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{ + // parent_tidptr and child_tidptr are always 0 because neither + // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used. + { + seccomp.EqualTo( + syscall.CLONE_VM | + syscall.CLONE_FS | + syscall.CLONE_FILES | + syscall.CLONE_SIGHAND | + syscall.CLONE_SYSVSEM | + syscall.CLONE_THREAD), + seccomp.MatchAny{}, // newsp + // These arguments are left uninitialized by the Go + // runtime, so they may be anything (and are unused by + // the host). + seccomp.MatchAny{}, // parent_tidptr + seccomp.MatchAny{}, // tls + seccomp.MatchAny{}, // child_tidptr + }, + } + allowedSyscalls[syscall.SYS_FSTATAT] = []seccomp.Rule{} } diff --git a/runsc/fsgofer/filter/extra_filters_race.go b/runsc/fsgofer/filter/extra_filters_race.go index 885c92f7a..20a0732be 100644 --- a/runsc/fsgofer/filter/extra_filters_race.go +++ b/runsc/fsgofer/filter/extra_filters_race.go @@ -35,6 +35,7 @@ func instrumentationFilters() seccomp.SyscallRules { syscall.SYS_MUNLOCK: {}, syscall.SYS_NANOSLEEP: {}, syscall.SYS_OPEN: {}, + syscall.SYS_OPENAT: {}, syscall.SYS_SET_ROBUST_LIST: {}, // Used within glibc's malloc. syscall.SYS_TIME: {}, diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b0788bd23..0b628c8ce 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -181,6 +181,8 @@ func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID { // The few exceptions where it cannot be done are: utimensat on symlinks, and // Connect() for the socket address. type localFile struct { + p9.DisallowClientCalls + // attachPoint is the attachPoint that serves this localFile. attachPoint *attachPoint @@ -1179,9 +1181,6 @@ func extractErrno(err error) unix.Errno { func (l *localFile) checkROMount() error { if conf := l.attachPoint.conf; conf.ROMount { - if conf.PanicOnWrite { - panic("attempt to write to RO mount") - } return unix.EROFS } return nil diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go index 0e4945b3d..a84206686 100644 --- a/runsc/fsgofer/fsgofer_test.go +++ b/runsc/fsgofer/fsgofer_test.go @@ -553,29 +553,6 @@ func TestROMountChecks(t *testing.T) { }) } -func TestROMountPanics(t *testing.T) { - conf := Config{ROMount: true, PanicOnWrite: true} - uid := p9.UID(os.Getuid()) - gid := p9.GID(os.Getgid()) - - runCustom(t, allTypes, []Config{conf}, func(t *testing.T, s state) { - if s.fileType != unix.S_IFLNK { - assertPanic(t, func() { s.file.Open(p9.WriteOnly) }) - } - assertPanic(t, func() { s.file.Create("some_file", p9.ReadWrite, 0777, uid, gid) }) - assertPanic(t, func() { s.file.Mkdir("some_dir", 0777, uid, gid) }) - assertPanic(t, func() { s.file.RenameAt("some_file", s.file, "other_file") }) - assertPanic(t, func() { s.file.Symlink("some_place", "some_symlink", uid, gid) }) - assertPanic(t, func() { s.file.UnlinkAt("some_file", 0) }) - assertPanic(t, func() { s.file.Link(s.file, "some_link") }) - assertPanic(t, func() { s.file.Mknod("some-nod", 0777, 1, 2, uid, gid) }) - - valid := p9.SetAttrMask{Size: true} - attr := p9.SetAttr{Size: 0} - assertPanic(t, func() { s.file.SetAttr(valid, attr) }) - }) -} - func TestWalkNotFound(t *testing.T) { runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) { if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT { diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index a339937fb..a8f4f64a5 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -478,10 +478,10 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn cmd.Stderr = nil // If the console control socket file is provided, then create a new - // pty master/slave pair and set the TTY on the sandbox process. + // pty master/replica pair and set the TTY on the sandbox process. if args.Spec.Process.Terminal && args.ConsoleSocket != "" { // console.NewWithSocket will send the master on the given - // socket, and return the slave. + // socket, and return the replica. tty, err := console.NewWithSocket(args.ConsoleSocket) if err != nil { return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 43851a22f..679d8bc8e 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/kernel/auth", + "//runsc/config", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_mohae_deepcopy//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", diff --git a/runsc/specutils/seccomp/BUILD b/runsc/specutils/seccomp/BUILD new file mode 100644 index 000000000..3520f2d6d --- /dev/null +++ b/runsc/specutils/seccomp/BUILD @@ -0,0 +1,34 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "seccomp", + srcs = [ + "audit_amd64.go", + "audit_arm64.go", + "seccomp.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/abi/linux", + "//pkg/bpf", + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/kernel", + "//pkg/sentry/syscalls/linux", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], +) + +go_test( + name = "seccomp_test", + size = "small", + srcs = ["seccomp_test.go"], + library = ":seccomp", + deps = [ + "//pkg/binary", + "//pkg/bpf", + "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", + ], +) diff --git a/runsc/specutils/seccomp/audit_amd64.go b/runsc/specutils/seccomp/audit_amd64.go new file mode 100644 index 000000000..417cf4a7a --- /dev/null +++ b/runsc/specutils/seccomp/audit_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package seccomp + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +const ( + nativeArchAuditNo = linux.AUDIT_ARCH_X86_64 +) diff --git a/runsc/specutils/seccomp/audit_arm64.go b/runsc/specutils/seccomp/audit_arm64.go new file mode 100644 index 000000000..b727ceff2 --- /dev/null +++ b/runsc/specutils/seccomp/audit_arm64.go @@ -0,0 +1,25 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package seccomp + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +const ( + nativeArchAuditNo = linux.AUDIT_ARCH_AARCH64 +) diff --git a/runsc/specutils/seccomp/seccomp.go b/runsc/specutils/seccomp/seccomp.go new file mode 100644 index 000000000..5932f7a41 --- /dev/null +++ b/runsc/specutils/seccomp/seccomp.go @@ -0,0 +1,229 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package seccomp implements some features of libseccomp in order to support +// OCI. +package seccomp + +import ( + "fmt" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/seccomp" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" +) + +var ( + killThreadAction = linux.SECCOMP_RET_KILL_THREAD + trapAction = linux.SECCOMP_RET_TRAP + // runc always returns EPERM as the errorcode for SECCOMP_RET_ERRNO + errnoAction = linux.SECCOMP_RET_ERRNO.WithReturnCode(uint16(syscall.EPERM)) + // runc always returns EPERM as the errorcode for SECCOMP_RET_TRACE + traceAction = linux.SECCOMP_RET_TRACE.WithReturnCode(uint16(syscall.EPERM)) + allowAction = linux.SECCOMP_RET_ALLOW +) + +// BuildProgram generates a bpf program based on the given OCI seccomp +// config. +func BuildProgram(s *specs.LinuxSeccomp) (bpf.Program, error) { + defaultAction, err := convertAction(s.DefaultAction) + if err != nil { + return bpf.Program{}, fmt.Errorf("secomp default action: %w", err) + } + ruleset, err := convertRules(s) + if err != nil { + return bpf.Program{}, fmt.Errorf("invalid seccomp rules: %w", err) + } + + instrs, err := seccomp.BuildProgram(ruleset, defaultAction, killThreadAction) + if err != nil { + return bpf.Program{}, fmt.Errorf("building seccomp program: %w", err) + } + + program, err := bpf.Compile(instrs) + if err != nil { + return bpf.Program{}, fmt.Errorf("compiling seccomp program: %w", err) + } + + return program, nil +} + +// lookupSyscallNo gets the syscall number for the syscall with the given name +// for the given architecture. +func lookupSyscallNo(arch uint32, name string) (uint32, error) { + var table *kernel.SyscallTable + switch arch { + case linux.AUDIT_ARCH_X86_64: + table = slinux.AMD64 + case linux.AUDIT_ARCH_AARCH64: + table = slinux.ARM64 + } + if table == nil { + return 0, fmt.Errorf("unsupported architecture: %d", arch) + } + n, err := table.LookupNo(name) + if err != nil { + return 0, err + } + return uint32(n), nil +} + +// convertAction converts a LinuxSeccompAction to BPFAction +func convertAction(act specs.LinuxSeccompAction) (linux.BPFAction, error) { + // TODO(gvisor.dev/issue/3124): Update specs package to include ActLog and ActKillProcess. + switch act { + case specs.ActKill: + return killThreadAction, nil + case specs.ActTrap: + return trapAction, nil + case specs.ActErrno: + return errnoAction, nil + case specs.ActTrace: + return traceAction, nil + case specs.ActAllow: + return allowAction, nil + default: + return 0, fmt.Errorf("invalid action: %v", act) + } +} + +// convertRules converts OCI linux seccomp rules into RuleSets that can be used by +// the seccomp package to build a seccomp program. +func convertRules(s *specs.LinuxSeccomp) ([]seccomp.RuleSet, error) { + // NOTE: Architectures are only really relevant when calling 32bit syscalls + // on a 64bit system. Since we don't support that in gVisor anyway, we + // ignore Architectures and only test against the native architecture. + + ruleset := []seccomp.RuleSet{} + + for _, syscall := range s.Syscalls { + sysRules := seccomp.NewSyscallRules() + + action, err := convertAction(syscall.Action) + if err != nil { + return nil, err + } + + // Args + rules, err := convertArgs(syscall.Args) + if err != nil { + return nil, err + } + + for _, name := range syscall.Names { + syscallNo, err := lookupSyscallNo(nativeArchAuditNo, name) + if err != nil { + // If there is an error looking up the syscall number, assume it is + // not supported on this architecture and ignore it. This is, for + // better or worse, what runc does. + log.Warningf("OCI seccomp: ignoring syscall %q", name) + continue + } + + for _, rule := range rules { + sysRules.AddRule(uintptr(syscallNo), rule) + } + } + + ruleset = append(ruleset, seccomp.RuleSet{ + Rules: sysRules, + Action: action, + }) + } + + return ruleset, nil +} + +// convertArgs converts an OCI seccomp argument rule to a list of seccomp.Rule. +func convertArgs(args []specs.LinuxSeccompArg) ([]seccomp.Rule, error) { + argCounts := make([]uint, 6) + + for _, arg := range args { + if arg.Index > 6 { + return nil, fmt.Errorf("invalid index: %d", arg.Index) + } + + argCounts[arg.Index]++ + } + + // NOTE: If multiple rules apply to the same argument (same index) the + // action is triggered if any one of the rules matches (OR). If not, then + // all rules much match in order to trigger the action (AND). This appears to + // be some kind of legacy behavior of runc that nevertheless needs to be + // supported to maintain compatibility. + + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + rules := []seccomp.Rule{} + + // Old runc behavior - do this for compatibility. + // Add rules as ORs by adding separate Rules. + for _, arg := range args { + rule := seccomp.Rule{nil, nil, nil, nil, nil, nil} + + if err := convertRule(arg, &rule); err != nil { + return nil, err + } + + rules = append(rules, rule) + } + + return rules, nil + } + + // Add rules as ANDs by adding to the same Rule. + rule := seccomp.Rule{nil, nil, nil, nil, nil, nil} + for _, arg := range args { + if err := convertRule(arg, &rule); err != nil { + return nil, err + } + } + + return []seccomp.Rule{rule}, nil +} + +// convertRule converts and adds the arg to a rule. +func convertRule(arg specs.LinuxSeccompArg, rule *seccomp.Rule) error { + switch arg.Op { + case specs.OpEqualTo: + rule[arg.Index] = seccomp.EqualTo(arg.Value) + case specs.OpNotEqual: + rule[arg.Index] = seccomp.NotEqual(arg.Value) + case specs.OpGreaterThan: + rule[arg.Index] = seccomp.GreaterThan(arg.Value) + case specs.OpGreaterEqual: + rule[arg.Index] = seccomp.GreaterThanOrEqual(arg.Value) + case specs.OpLessThan: + rule[arg.Index] = seccomp.LessThan(arg.Value) + case specs.OpLessEqual: + rule[arg.Index] = seccomp.LessThanOrEqual(arg.Value) + case specs.OpMaskedEqual: + rule[arg.Index] = seccomp.MaskedEqual(uintptr(arg.Value), uintptr(arg.ValueTwo)) + default: + return fmt.Errorf("unsupported operand: %q", arg.Op) + } + return nil +} diff --git a/runsc/specutils/seccomp/seccomp_test.go b/runsc/specutils/seccomp/seccomp_test.go new file mode 100644 index 000000000..850c237ba --- /dev/null +++ b/runsc/specutils/seccomp/seccomp_test.go @@ -0,0 +1,414 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seccomp + +import ( + "fmt" + "syscall" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/bpf" +) + +type seccompData struct { + nr uint32 + arch uint32 + instructionPointer uint64 + args [6]uint64 +} + +// asInput converts a seccompData to a bpf.Input. +func asInput(d seccompData) bpf.Input { + return bpf.InputBytes{binary.Marshal(nil, binary.LittleEndian, d), binary.LittleEndian} +} + +// testInput creates an Input struct with given seccomp input values. +func testInput(arch uint32, syscallName string, args *[6]uint64) bpf.Input { + syscallNo, err := lookupSyscallNo(arch, syscallName) + if err != nil { + // Assume tests set valid syscall names. + panic(err) + } + + if args == nil { + argArray := [6]uint64{0, 0, 0, 0, 0, 0} + args = &argArray + } + + data := seccompData{ + nr: syscallNo, + arch: arch, + args: *args, + } + + return asInput(data) +} + +// testCase holds a seccomp test case. +type testCase struct { + name string + config specs.LinuxSeccomp + input bpf.Input + expected uint32 +} + +var ( + // seccompTests is a list of speccomp test cases. + seccompTests = []testCase{ + { + name: "default_allow", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + }, + input: testInput(nativeArchAuditNo, "read", nil), + expected: uint32(allowAction), + }, + { + name: "default_deny", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + }, + input: testInput(nativeArchAuditNo, "read", nil), + expected: uint32(errnoAction), + }, + { + name: "deny_arch", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + }, + Action: specs.ActErrno, + }, + }, + }, + // Syscall matches but the arch is AUDIT_ARCH_X86 so the return + // value is the bad arch action. + input: asInput(seccompData{nr: 183, arch: 0x40000003}), // + expected: uint32(killThreadAction), + }, + { + name: "match_name_errno", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getcwd", nil), + expected: uint32(errnoAction), + }, + { + name: "match_name_trace", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "write", nil), + expected: uint32(traceAction), + }, + { + name: "no_match_name_allow", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getcwd", + "chmod", + }, + Action: specs.ActErrno, + }, + { + Names: []string{ + "write", + }, + Action: specs.ActTrace, + }, + }, + }, + input: testInput(nativeArchAuditNo, "openat", nil), + expected: uint32(allowAction), + }, + { + name: "simple_match_args", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(errnoAction), + }, + { + name: "match_args_or", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + { + Index: 0, + Value: syscall.CLONE_VM, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(errnoAction), + }, + { + name: "match_args_and", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getsockopt", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 1, + Value: syscall.SOL_SOCKET, + Op: specs.OpEqualTo, + }, + { + Index: 2, + Value: syscall.SO_PEERCRED, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET, syscall.SO_PEERCRED}), + expected: uint32(errnoAction), + }, + { + name: "no_match_args_and", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "getsockopt", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 1, + Value: syscall.SOL_SOCKET, + Op: specs.OpEqualTo, + }, + { + Index: 2, + Value: syscall.SO_PEERCRED, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "getsockopt", &[6]uint64{0, syscall.SOL_SOCKET}), + expected: uint32(allowAction), + }, + { + name: "Simple args (no match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + Op: specs.OpEqualTo, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_VM}), + expected: uint32(allowAction), + }, + { + name: "OpMaskedEqual (match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS, + ValueTwo: syscall.CLONE_FS, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS | syscall.CLONE_VM}), + expected: uint32(errnoAction), + }, + { + name: "OpMaskedEqual (no match)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActAllow, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: syscall.CLONE_FS | syscall.CLONE_VM, + ValueTwo: syscall.CLONE_FS | syscall.CLONE_VM, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActErrno, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{syscall.CLONE_FS}), + expected: uint32(allowAction), + }, + { + name: "OpMaskedEqual (clone)", + config: specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{ + "clone", + }, + // This comes from the Docker default seccomp + // profile for clone. + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: 0x7e020000, + ValueTwo: 0x0, + Op: specs.OpMaskedEqual, + }, + }, + Action: specs.ActAllow, + }, + }, + }, + input: testInput(nativeArchAuditNo, "clone", &[6]uint64{0x50f00}), + expected: uint32(allowAction), + }, + } +) + +// TestRunscSeccomp generates seccomp programs from OCI config and executes +// them using runsc's library, comparing against expected results. +func TestRunscSeccomp(t *testing.T) { + for _, tc := range seccompTests { + t.Run(tc.name, func(t *testing.T) { + runscProgram, err := BuildProgram(&tc.config) + if err != nil { + t.Fatalf("generating runsc BPF: %v", err) + } + + if err := checkProgram(runscProgram, tc.input, tc.expected); err != nil { + t.Fatalf("running runsc BPF: %v", err) + } + }) + } +} + +// checkProgram runs the given program over the given input and checks the +// result against the expected output. +func checkProgram(p bpf.Program, in bpf.Input, expected uint32) error { + result, err := bpf.Exec(p, in) + if err != nil { + return err + } + + if result != expected { + // Include a decoded version of the program in output for debugging purposes. + decoded, _ := bpf.DecodeProgram(p) + return fmt.Errorf("Unexpected result: got: %d, expected: %d\nBPF Program\n%s", result, expected, decoded) + } + + return nil +} diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5015c3a84..0392e3e83 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/runsc/config" ) // ExePath must point to runsc binary, which is normally the same binary. It's @@ -110,11 +111,6 @@ func ValidateSpec(spec *specs.Spec) error { log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") } - // TODO(gvisor.dev/issue/510): Apply seccomp to application inside sandbox. - if spec.Linux != nil && spec.Linux.Seccomp != nil { - log.Warningf("Seccomp spec is being ignored") - } - if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { return err @@ -161,18 +157,18 @@ func OpenSpec(bundleDir string) (*os.File, error) { // ReadSpec reads an OCI runtime spec from the given bundle directory. // ReadSpec also normalizes all potential relative paths into absolute // path, e.g. spec.Root.Path, mount.Source. -func ReadSpec(bundleDir string) (*specs.Spec, error) { +func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { specFile, err := OpenSpec(bundleDir) if err != nil { return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) } defer specFile.Close() - return ReadSpecFromFile(bundleDir, specFile) + return ReadSpecFromFile(bundleDir, specFile, conf) } // ReadSpecFromFile reads an OCI runtime spec from the given File, and // normalizes all relative paths into absolute by prepending the bundle dir. -func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) { +func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { if _, err := specFile.Seek(0, os.SEEK_SET); err != nil { return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) } @@ -195,6 +191,20 @@ func ReadSpecFromFile(bundleDir string, specFile *os.File) (*specs.Spec, error) m.Source = absPath(bundleDir, m.Source) } } + + // Override flags using annotation to allow customization per sandbox + // instance. + for annotation, val := range spec.Annotations { + const flagPrefix = "dev.gvisor.flag." + if strings.HasPrefix(annotation, flagPrefix) { + name := annotation[len(flagPrefix):] + log.Infof("Overriding flag: %s=%q", name, val) + if err := conf.Override(name, val); err != nil { + return nil, err + } + } + } + return &spec, nil } |