summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/BUILD5
-rw-r--r--runsc/boot/controller.go12
-rw-r--r--runsc/boot/filter/config.go491
-rw-r--r--runsc/boot/filter/config_amd64.go41
-rw-r--r--runsc/boot/filter/config_arm64.go25
-rw-r--r--runsc/boot/filter/config_profile.go6
-rw-r--r--runsc/boot/fs.go27
-rw-r--r--runsc/boot/loader.go151
-rw-r--r--runsc/boot/loader_test.go13
-rw-r--r--runsc/boot/vfs.go189
10 files changed, 584 insertions, 376 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 040f6a72d..2d9517f4a 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -26,10 +26,13 @@ go_library(
deps = [
"//pkg/abi",
"//pkg/abi/linux",
+ "//pkg/bpf",
+ "//pkg/cleanup",
"//pkg/context",
"//pkg/control/server",
"//pkg/cpuid",
"//pkg/eventchannel",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/memutil",
@@ -106,6 +109,7 @@ go_library(
"//runsc/boot/pprof",
"//runsc/config",
"//runsc/specutils",
+ "//runsc/specutils/seccomp",
"@com_github_golang_protobuf//proto:go_default_library",
"@com_github_opencontainers_runtime_spec//specs-go:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
@@ -123,6 +127,7 @@ go_test(
library = ":boot",
deps = [
"//pkg/control/server",
+ "//pkg/fd",
"//pkg/fspath",
"//pkg/log",
"//pkg/p9",
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 68a2b45cf..894651519 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -22,6 +22,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -257,13 +258,20 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ fds, err := fd.NewFromFiles(args.FilePayload.Files)
if err != nil {
+ return err
+ }
+ defer func() {
+ for _, fd := range fds {
+ _ = fd.Close()
+ }
+ }()
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
return err
}
log.Debugf("Container %q started", args.CID)
-
return nil
}
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 149eb0b1b..6ac19668f 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -27,41 +27,30 @@ import (
// allowedSyscalls is the set of syscalls executed by the Sentry to the host OS.
var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_CLOCK_GETTIME: {},
- syscall.SYS_CLONE: []seccomp.Rule{
- {
- seccomp.AllowValue(
- syscall.CLONE_VM |
- syscall.CLONE_FS |
- syscall.CLONE_FILES |
- syscall.CLONE_SIGHAND |
- syscall.CLONE_SYSVSEM |
- syscall.CLONE_THREAD),
- },
- },
- syscall.SYS_CLOSE: {},
- syscall.SYS_DUP: {},
+ syscall.SYS_CLOSE: {},
+ syscall.SYS_DUP: {},
syscall.SYS_DUP3: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_CLOEXEC),
},
},
syscall.SYS_EPOLL_CREATE1: {},
syscall.SYS_EPOLL_CTL: {},
syscall.SYS_EPOLL_PWAIT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EVENTFD2: []seccomp.Rule{
{
- seccomp.AllowValue(0),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(0),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_EXIT: {},
@@ -70,16 +59,16 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FCHMOD: {},
syscall.SYS_FCNTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_SETFL),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_SETFL),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.F_GETFD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.F_GETFD),
},
},
syscall.SYS_FSTAT: {},
@@ -87,52 +76,52 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_FTRUNCATE: {},
syscall.SYS_FUTEX: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
+ seccomp.MatchAny{},
},
// Non-private variants are included for flipcall support. They are otherwise
// unnecessary, as the sentry will use only private futexes internally.
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAIT),
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAIT),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(linux.FUTEX_WAKE),
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(linux.FUTEX_WAKE),
+ seccomp.MatchAny{},
},
},
syscall.SYS_GETPID: {},
unix.SYS_GETRANDOM: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_DOMAIN),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_DOMAIN),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
},
syscall.SYS_GETTID: {},
@@ -141,34 +130,34 @@ var allowedSyscalls = seccomp.SyscallRules{
// setting/getting termios and winsize.
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCGETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCGETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETS),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETS),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSF),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSF),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TCSETSW),
- seccomp.AllowAny{}, /* termios struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TCSETSW),
+ seccomp.MatchAny{}, /* termios struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCSWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCSWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
{
- seccomp.AllowAny{}, /* fd */
- seccomp.AllowValue(linux.TIOCGWINSZ),
- seccomp.AllowAny{}, /* winsize struct */
+ seccomp.MatchAny{}, /* fd */
+ seccomp.EqualTo(linux.TIOCGWINSZ),
+ seccomp.MatchAny{}, /* winsize struct */
},
},
syscall.SYS_LSEEK: {},
@@ -182,46 +171,46 @@ var allowedSyscalls = seccomp.SyscallRules{
// TODO(b/148688965): Remove once this is gone from Go.
syscall.SYS_MLOCK: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(4096),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4096),
},
},
syscall.SYS_MMAP: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_SHARED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_SHARED),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_STACK),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_NORESERVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.PROT_WRITE | syscall.PROT_READ),
- seccomp.AllowValue(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.PROT_WRITE | syscall.PROT_READ),
+ seccomp.EqualTo(syscall.MAP_PRIVATE | syscall.MAP_ANONYMOUS | syscall.MAP_FIXED),
},
},
syscall.SYS_MPROTECT: {},
@@ -237,32 +226,32 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_READ: {},
syscall.SYS_RECVMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC),
},
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC | syscall.MSG_PEEK),
},
},
syscall.SYS_RECVMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(fdbased.MaxMsgsPerRecv),
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(fdbased.MaxMsgsPerRecv),
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
unix.SYS_SENDMMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT),
- seccomp.AllowValue(0),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_RESTART_SYSCALL: {},
@@ -272,49 +261,49 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_SCHED_YIELD: {},
syscall.SYS_SENDMSG: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.MSG_DONTWAIT | syscall.MSG_NOSIGNAL),
},
},
syscall.SYS_SETITIMER: {},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
// Used by fs/host to shutdown host sockets.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RD)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_WR)},
// Used by unet to shutdown connections.
- {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
+ {seccomp.MatchAny{}, seccomp.EqualTo(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
unix.SYS_STATX: {},
syscall.SYS_SYNC_FILE_RANGE: {},
syscall.SYS_TEE: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(1), /* len */
- seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(1), /* len */
+ seccomp.EqualTo(unix.SPLICE_F_NONBLOCK), /* flags */
},
},
syscall.SYS_TGKILL: []seccomp.Rule{
{
- seccomp.AllowValue(uint64(os.Getpid())),
+ seccomp.EqualTo(uint64(os.Getpid())),
},
},
syscall.SYS_UTIMENSAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* null pathname */
- seccomp.AllowAny{},
- seccomp.AllowValue(0), /* flags */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* null pathname */
+ seccomp.MatchAny{},
+ seccomp.EqualTo(0), /* flags */
},
},
syscall.SYS_WRITE: {},
// For rawfile.NonBlockingWriteIovec.
syscall.SYS_WRITEV: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
seccomp.GreaterThan(0),
},
},
@@ -325,10 +314,10 @@ func hostInetFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT4: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
},
},
syscall.SYS_BIND: {},
@@ -337,84 +326,84 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_GETSOCKNAME: {},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_ERROR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_ERROR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_KEEPALIVE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_KEEPALIVE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_TYPE),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TYPE),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_LINGER),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_LINGER),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_INFO),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_INFO),
},
},
syscall.SYS_IOCTL: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCOUTQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCOUTQ),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.TIOCINQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.TIOCINQ),
},
},
syscall.SYS_LISTEN: {},
@@ -425,103 +414,103 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_SENDTO: {},
syscall.SYS_SETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_V6ONLY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_SNDBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_RCVBUF),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_TCP),
- seccomp.AllowValue(syscall.TCP_NODELAY),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(syscall.TCP_NODELAY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_TOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_TOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IP),
- seccomp.AllowValue(syscall.IP_RECVTOS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVTOS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_TCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_TCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_IPV6),
- seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
- seccomp.AllowAny{},
- seccomp.AllowValue(4),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVTCLASS),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
},
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RD),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RD),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_WR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_WR),
},
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SHUT_RDWR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SHUT_RDWR),
},
},
syscall.SYS_SOCKET: []seccomp.Rule{
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_STREAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
{
- seccomp.AllowValue(syscall.AF_INET6),
- seccomp.AllowValue(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
- seccomp.AllowValue(0),
+ seccomp.EqualTo(syscall.AF_INET6),
+ seccomp.EqualTo(syscall.SOCK_DGRAM | syscall.SOCK_NONBLOCK | syscall.SOCK_CLOEXEC),
+ seccomp.EqualTo(0),
},
},
syscall.SYS_WRITEV: {},
@@ -532,20 +521,20 @@ func controlServerFilters(fd int) seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_ACCEPT: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
+ seccomp.EqualTo(fd),
},
},
syscall.SYS_LISTEN: []seccomp.Rule{
{
- seccomp.AllowValue(fd),
- seccomp.AllowValue(16 /* unet.backlog */),
+ seccomp.EqualTo(fd),
+ seccomp.EqualTo(16 /* unet.backlog */),
},
},
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_PEERCRED),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_PEERCRED),
},
},
}
diff --git a/runsc/boot/filter/config_amd64.go b/runsc/boot/filter/config_amd64.go
index 5335ff82c..cea5613b8 100644
--- a/runsc/boot/filter/config_amd64.go
+++ b/runsc/boot/filter/config_amd64.go
@@ -24,8 +24,41 @@ import (
)
func init() {
- allowedSyscalls[syscall.SYS_ARCH_PRCTL] = append(allowedSyscalls[syscall.SYS_ARCH_PRCTL],
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_GET_FS)},
- seccomp.Rule{seccomp.AllowValue(linux.ARCH_SET_FS)},
- )
+ allowedSyscalls[syscall.SYS_ARCH_PRCTL] = []seccomp.Rule{
+ // TODO(b/168828518): No longer used in Go 1.16+.
+ {seccomp.EqualTo(linux.ARCH_SET_FS)},
+ }
+
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ // parent_tidptr and child_tidptr are always 0 because neither
+ // CLONE_PARENT_SETTID nor CLONE_CHILD_SETTID are used.
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SETTLS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ {
+ // TODO(b/168828518): No longer used in Go 1.16+ (on amd64).
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ seccomp.EqualTo(0), // parent_tidptr
+ seccomp.EqualTo(0), // child_tidptr
+ seccomp.MatchAny{}, // tls
+ },
+ }
}
diff --git a/runsc/boot/filter/config_arm64.go b/runsc/boot/filter/config_arm64.go
index 7fa9bbda3..37313f97f 100644
--- a/runsc/boot/filter/config_arm64.go
+++ b/runsc/boot/filter/config_arm64.go
@@ -16,6 +16,29 @@
package filter
-// Reserve for future customization.
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
func init() {
+ allowedSyscalls[syscall.SYS_CLONE] = []seccomp.Rule{
+ {
+ seccomp.EqualTo(
+ syscall.CLONE_VM |
+ syscall.CLONE_FS |
+ syscall.CLONE_FILES |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_SYSVSEM |
+ syscall.CLONE_THREAD),
+ seccomp.MatchAny{}, // newsp
+ // These arguments are left uninitialized by the Go
+ // runtime, so they may be anything (and are unused by
+ // the host).
+ seccomp.MatchAny{}, // parent_tidptr
+ seccomp.MatchAny{}, // tls
+ seccomp.MatchAny{}, // child_tidptr
+ },
+ }
}
diff --git a/runsc/boot/filter/config_profile.go b/runsc/boot/filter/config_profile.go
index 194952a7b..7b8669595 100644
--- a/runsc/boot/filter/config_profile.go
+++ b/runsc/boot/filter/config_profile.go
@@ -25,9 +25,9 @@ func profileFilters() seccomp.SyscallRules {
return seccomp.SyscallRules{
syscall.SYS_OPENAT: []seccomp.Rule{
{
- seccomp.AllowAny{},
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
+ seccomp.MatchAny{},
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.O_RDONLY | syscall.O_LARGEFILE | syscall.O_CLOEXEC),
},
},
}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 163265afe..ddf288456 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -34,6 +34,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
@@ -253,7 +254,7 @@ func mustFindFilesystem(name string) fs.Filesystem {
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
-func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
// Construct a ramfs tree of mount points. The contents never
// change, so this can be fully caching. There's no real
// filesystem backing this tree, so we set the filesystem to
@@ -263,7 +264,7 @@ func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string
if err != nil {
return nil, fmt.Errorf("creating mount tree: %v", err)
}
- overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+ overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
if err != nil {
return nil, fmt.Errorf("adding mount overlay: %v", err)
}
@@ -320,14 +321,14 @@ func adjustDirentCache(k *kernel.Kernel) error {
}
type fdDispenser struct {
- fds []int
+ fds []*fd.FD
}
func (f *fdDispenser) remove() int {
if f.empty() {
panic("fdDispenser out of fds")
}
- rv := f.fds[0]
+ rv := f.fds[0].Release()
f.fds = f.fds[1:]
return rv
}
@@ -453,17 +454,17 @@ func (m *mountHint) isSupported() bool {
func (m *mountHint) checkCompatible(mount specs.Mount) error {
// Remove options that don't affect to mount's behavior.
masterOpts := filterUnsupportedOptions(m.mount)
- slaveOpts := filterUnsupportedOptions(mount)
+ replicaOpts := filterUnsupportedOptions(mount)
- if len(masterOpts) != len(slaveOpts) {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if len(masterOpts) != len(replicaOpts) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
sort.Strings(masterOpts)
- sort.Strings(slaveOpts)
+ sort.Strings(replicaOpts)
for i, opt := range masterOpts {
- if opt != slaveOpts[i] {
- return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+ if opt != replicaOpts[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
}
}
return nil
@@ -564,7 +565,7 @@ type containerMounter struct {
hints *podMountHints
}
-func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter {
return &containerMounter{
root: spec.Root,
mounts: compileMounts(spec),
@@ -740,7 +741,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Con
// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
// mounted even if they are not in the spec.
submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
- rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
if err != nil {
return nil, fmt.Errorf("adding submount overlay: %v", err)
}
@@ -850,7 +851,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Confi
submounts := subtargets(m.Destination, c.mounts)
if len(submounts) > 0 {
log.Infof("Adding submount overlay over %q", m.Destination)
- inode, err = addSubmountOverlay(ctx, inode, submounts)
+ inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
if err != nil {
return fmt.Errorf("adding submount overlay: %v", err)
}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index c3c754046..2e652ddad 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,8 +27,10 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
@@ -69,6 +71,7 @@ import (
"gvisor.dev/gvisor/runsc/boot/pprof"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+ "gvisor.dev/gvisor/runsc/specutils/seccomp"
// Include supported socket providers.
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
@@ -89,10 +92,10 @@ type containerInfo struct {
procArgs kernel.CreateProcessArgs
// stdioFDs contains stdin, stdout, and stderr.
- stdioFDs []int
+ stdioFDs []*fd.FD
// goferFDs are the FDs that attach the sandbox to the gofers.
- goferFDs []int
+ goferFDs []*fd.FD
}
// Loader keeps state needed to start the kernel and run the container.
@@ -356,12 +359,17 @@ func New(args Args) (*Loader, error) {
k.SetHostMount(hostMount)
}
+ info := containerInfo{
+ conf: args.Conf,
+ spec: args.Spec,
+ procArgs: procArgs,
+ }
+
// Make host FDs stable between invocations. Host FDs must map to the exact
// same number when the sandbox is restored. Otherwise the wrong FD will be
// used.
- var stdioFDs []int
newfd := startingStdioFD
- for _, fd := range args.StdioFDs {
+ for _, stdioFD := range args.StdioFDs {
// Check that newfd is unused to avoid clobbering over it.
if _, err := unix.FcntlInt(uintptr(newfd), unix.F_GETFD, 0); !errors.Is(err, unix.EBADF) {
if err != nil {
@@ -370,14 +378,17 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("unable to remap stdios, FD %d is already in use", newfd)
}
- err := unix.Dup3(fd, newfd, unix.O_CLOEXEC)
+ err := unix.Dup3(stdioFD, newfd, unix.O_CLOEXEC)
if err != nil {
- return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
+ return nil, fmt.Errorf("dup3 of stdios failed: %w", err)
}
- stdioFDs = append(stdioFDs, newfd)
- _ = unix.Close(fd)
+ info.stdioFDs = append(info.stdioFDs, fd.New(newfd))
+ _ = unix.Close(stdioFD)
newfd++
}
+ for _, goferFD := range args.GoferFDs {
+ info.goferFDs = append(info.goferFDs, fd.New(goferFD))
+ }
eid := execID{cid: args.ID}
l := &Loader{
@@ -386,13 +397,7 @@ func New(args Args) (*Loader, error) {
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
mountHints: mountHints,
- root: containerInfo{
- conf: args.Conf,
- stdioFDs: stdioFDs,
- goferFDs: args.GoferFDs,
- spec: args.Spec,
- procArgs: procArgs,
- },
+ root: info,
}
// We don't care about child signals; some platforms can generate a
@@ -466,9 +471,14 @@ func (l *Loader) Destroy() {
}
l.watchdog.Stop()
- for i, fd := range l.root.stdioFDs {
- _ = unix.Close(fd)
- l.root.stdioFDs[i] = -1
+ // In the success case, stdioFDs and goferFDs will only contain
+ // released/closed FDs whose ownership has been passed over to host FDs and
+ // gofer sessions. Close them here in case of failure.
+ for _, fd := range l.root.stdioFDs {
+ _ = fd.Close()
+ }
+ for _, fd := range l.root.goferFDs {
+ _ = fd.Close()
}
}
@@ -499,6 +509,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
+// installSeccompFilters installs sandbox seccomp filters with the host.
func (l *Loader) installSeccompFilters() error {
if l.root.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
@@ -569,6 +580,7 @@ func (l *Loader) run() error {
if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
return err
}
+
}
ep.tg = l.k.GlobalInit()
@@ -598,17 +610,6 @@ func (l *Loader) run() error {
}
})
- // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again
- // either in createFDTable() during initial start or in descriptor.initAfterLoad()
- // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the
- // passed FDs, so only close for VFS1.
- if !kernel.VFS2Enabled {
- for i, fd := range l.root.stdioFDs {
- _ = unix.Close(fd)
- l.root.stdioFDs[i] = -1
- }
- }
-
log.Infof("Process should have started...")
l.watchdog.Start()
return l.k.Start()
@@ -628,9 +629,9 @@ func (l *Loader) createContainer(cid string) error {
}
// startContainer starts a child container. It returns the thread group ID of
-// the newly created process. Caller owns 'files' and may close them after
-// this method returns.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*os.File) error {
+// the newly created process. Used FDs are either closed or released. It's safe
+// for the caller to close any remaining files upon return.
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -681,28 +682,15 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
}
info := &containerInfo{
- conf: conf,
- spec: spec,
+ conf: conf,
+ spec: spec,
+ stdioFDs: files[:3],
+ goferFDs: files[3:],
}
info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
-
- // setupContainerFS() dups stdioFDs, so we don't need to dup them here.
- for _, f := range files[:3] {
- info.stdioFDs = append(info.stdioFDs, int(f.Fd()))
- }
-
- // Can't take ownership away from os.File. dup them to get a new FDs.
- for _, f := range files[3:] {
- fd, err := unix.Dup(int(f.Fd()))
- if err != nil {
- return fmt.Errorf("failed to dup file: %v", err)
- }
- info.goferFDs = append(info.goferFDs, fd)
- }
-
tg, err := l.createContainerProcess(false, cid, info, ep)
if err != nil {
return err
@@ -780,19 +768,44 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
}
+ // Install seccomp filters with the new task if there are any.
+ if info.conf.OCISeccomp {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
+ if err != nil {
+ return nil, fmt.Errorf("building seccomp program: %v", err)
+ }
+
+ if log.IsLogging(log.Debug) {
+ out, _ := bpf.DecodeProgram(program)
+ log.Debugf("Installing OCI seccomp filters\nProgram:\n%s", out)
+ }
+
+ task := tg.Leader()
+ // NOTE: It seems Flags are ignored by runc so we ignore them too.
+ if err := task.AppendSyscallFilter(program, true); err != nil {
+ return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ }
+ }
+ } else {
+ if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
+ log.Warningf("Seccomp spec is being ignored")
+ }
+ }
+
return tg, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FDs looking for disconnects, and destroys the container if a
// disconnect occurs in any of the gofer FDs.
-func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
go func() {
log.Debugf("Monitoring gofer health for container %q", cid)
var events []unix.PollFd
- for _, fd := range goferFDs {
+ for _, goferFD := range goferFDs {
events = append(events, unix.PollFd{
- Fd: int32(fd),
+ Fd: int32(goferFD.FD()),
Events: unix.POLLHUP | unix.POLLRDHUP,
})
}
@@ -1046,8 +1059,8 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
}
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
- netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
- transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+ netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
+ transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
@@ -1061,22 +1074,30 @@ func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (in
})}
// Enable SACK Recovery.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
- return nil, fmt.Errorf("failed to enable SACK: %s", err)
+ {
+ opt := tcpip.TCPSACKEnabled(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
// Set default TTLs as required by socket/netstack.
- opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
- if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
- }
- if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
- return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+ {
+ opt := tcpip.DefaultTTLOption(netstack.DefaultTTL)
+ if err := s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv4.ProtocolNumber, opt, opt, err)
+ }
+ if err := s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetNetworkProtocolOption(%d, &%T(%d)): %s", ipv6.ProtocolNumber, opt, opt, err)
+ }
}
// Enable Receive Buffer Auto-Tuning.
- if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
- return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
+ {
+ opt := tcpip.TCPModerateReceiveBufferOption(true)
+ if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption(%d, &%T(%t)): %s", tcp.ProtocolNumber, opt, opt, err)
+ }
}
return &s, nil
@@ -1272,7 +1293,7 @@ func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2
return ep.tty, ep.ttyVFS2, nil
}
-func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
+func createFDTable(ctx context.Context, console bool, stdioFDs []*fd.FD) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if len(stdioFDs) != 3 {
return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 2343ce76c..bf9ec5d38 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -26,6 +26,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/control/server"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
@@ -444,7 +445,7 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+ mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{})
mns, err := mntr.createMountNamespace(ctx, conf)
if err != nil {
t.Fatalf("failed to create mount namespace: %v", err)
@@ -490,9 +491,9 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
}
ctx := l.k.SupervisorContext()
- mns, err := mntr.setupVFS2(ctx, l.root.conf, &l.root.procArgs)
+ mns, err := mntr.mountAll(l.root.conf, &l.root.procArgs)
if err != nil {
- t.Fatalf("failed to setupVFS2: %v", err)
+ t.Fatalf("mountAll: %v", err)
}
root := mns.Root()
@@ -702,7 +703,11 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
+ var ioFDs []*fd.FD
+ for _, ioFD := range tc.ioFDs {
+ ioFDs = append(ioFDs, fd.New(ioFD))
+ }
+ mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{})
actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 66b6cf19b..e36664938 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -21,6 +21,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
@@ -134,7 +135,7 @@ func registerFilesystems(k *kernel.Kernel) error {
}
func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
- mns, err := mntr.setupVFS2(ctx, conf, procArgs)
+ mns, err := mntr.mountAll(conf, procArgs)
if err != nil {
return fmt.Errorf("failed to setupFS: %w", err)
}
@@ -149,7 +150,7 @@ func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containe
return nil
}
-func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
+func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
log.Infof("Configuring container's file system with VFS2")
// Create context with root credentials to mount the filesystem (the current
@@ -168,34 +169,108 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, p
}
rootProcArgs.MountNamespaceVFS2 = mns
+ root := mns.Root()
+ defer root.DecRef(rootCtx)
+ if root.Mount().ReadOnly() {
+ // Switch to ReadWrite while we setup submounts.
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
+ return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
+ }
+ // Restore back to ReadOnly at the end.
+ defer func() {
+ if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
+ panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
+ }
+ }()
+ }
+
// Mount submounts.
if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
}
+
return mns, nil
}
+// createMountNamespaceVFS2 creates the container's root mount and namespace.
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
fd := c.fds.remove()
- opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
+ data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
if conf.OverlayfsStaleRead {
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
// can only send mount options for specs.Mounts (specs.Root is missing
// Options field). So assume root is always on top of overlayfs.
- opts = append(opts, "overlayfs_stale_read")
+ data = append(data, "overlayfs_stale_read")
}
log.Infof("Mounting root over 9P, ioFD: %d", fd)
- mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{
- Data: strings.Join(opts, ","),
- })
+ opts := &vfs.MountOptions{
+ ReadOnly: c.root.Readonly,
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ Data: strings.Join(data, ","),
+ },
+ InternalMount: true,
+ }
+
+ fsName := gofer.Name
+ if conf.Overlay && !c.root.Readonly {
+ log.Infof("Adding overlay on top of root")
+ var err error
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting root with overlay: %w", err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
if err != nil {
return nil, fmt.Errorf("setting up mount namespace: %w", err)
}
return mns, nil
}
+// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
+// layer using tmpfs, and returns overlay mount options. "cleanup" must be called
+// after the options have been used to mount the overlay, to release refs on
+// lower and upper mounts.
+func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
+ // First copy options from lower layer to upper layer and overlay. Clear
+ // filesystem specific options.
+ upperOpts := *lowerOpts
+ upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ overlayOpts := *lowerOpts
+ overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
+
+ // Next mount upper and lower. Upper is a tmpfs mount to keep all
+ // modifications inside the sandbox.
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu := cleanup.Make(func() { upper.DecRef(ctx) })
+ defer cu.Clean()
+
+ // All writes go to the upper layer, be paranoid and make lower readonly.
+ lowerOpts.ReadOnly = true
+ lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
+ if err != nil {
+ return nil, nil, err
+ }
+ cu.Add(func() { lower.DecRef(ctx) })
+
+ // Configure overlay with both layers.
+ overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
+ UpperRoot: vfs.MakeVirtualDentry(upper, upper.Root()),
+ LowerRoots: []vfs.VirtualDentry{vfs.MakeVirtualDentry(lower, lower.Root())},
+ }
+ return &overlayOpts, cu.Release(), nil
+}
+
func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
mounts, err := c.prepareMountsVFS2()
if err != nil {
@@ -225,8 +300,9 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.
if mnt != nil && mnt.ReadOnly() {
// Switch to ReadWrite while we setup submounts.
if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
- return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
+ return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.Destination, err)
}
+ // Restore back to ReadOnly at the end.
defer func() {
if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
@@ -276,14 +352,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
}
func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
- root := mns.Root()
- defer root.DecRef(ctx)
- target := &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(submount.Destination),
- }
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
if err != nil {
return nil, fmt.Errorf("mountOptions failed: %w", err)
}
@@ -292,8 +361,27 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
return nil, nil
}
- if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil {
- return nil, err
+ if err := c.makeMountPoint(ctx, creds, mns, submount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", submount.Destination, err)
+ }
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of mount %q", submount.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(submount.Destination),
}
mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
if err != nil {
@@ -305,8 +393,9 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
-func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
+func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
fsName := m.Type
+ useOverlay := false
var data []string
// Find filesystem name and FS specific data field.
@@ -321,7 +410,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
var err error
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
if err != nil {
- return "", nil, err
+ return "", nil, false, err
}
case bind:
@@ -329,13 +418,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
if m.fd == 0 {
// Check that an FD was provided to fails fast. Technically FD=0 is valid,
// but unlikely to be correct in this context.
- return "", nil, fmt.Errorf("9P mount requires a connection FD")
+ return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
}
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
default:
log.Warningf("ignoring unknown filesystem type %q", m.Type)
- return "", nil, nil
+ return "", nil, false, nil
}
opts := &vfs.MountOptions{
@@ -360,11 +452,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mo
}
}
- if conf.Overlay {
- // All writes go to upper, be paranoid and make lower readonly.
- opts.ReadOnly = true
- }
- return fsName, opts, nil
+ return fsName, opts, useOverlay, nil
}
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
@@ -467,13 +555,25 @@ func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *conf
// Map mount type to filesystem name, and parse out the options that we are
// capable of dealing with.
mntFD := &mountAndFD{Mount: hint.mount}
- fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
+ fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
if err != nil {
return nil, err
}
if len(fsName) == 0 {
return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
}
+
+ if useOverlay {
+ log.Infof("Adding overlay on top of shared mount %q", mntFD.Destination)
+ var cleanup func()
+ opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
+ if err != nil {
+ return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.Destination, err)
+ }
+ defer cleanup()
+ fsName = overlay.Name
+ }
+
return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
}
@@ -484,7 +584,9 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
return nil, err
}
- _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
+ // Ignore data and useOverlay because these were already applied to
+ // the master mount.
+ _, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
if err != nil {
return nil, err
}
@@ -496,18 +598,39 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
root := mns.Root()
defer root.DecRef(ctx)
- if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil {
- return nil, err
- }
-
target := &vfs.PathOperation{
Root: root,
Start: root,
Path: fspath.Parse(mount.Destination),
}
+
+ if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
+ return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
+ }
+
if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
return nil, err
}
log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
return newMnt, nil
}
+
+func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
+ root := mns.Root()
+ defer root.DecRef(ctx)
+ target := &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dest),
+ }
+ // First check if mount point exists. When overlay is enabled, gofer doesn't
+ // allow changes to the FS, making MakeSyntheticMountpoint() ineffective
+ // because MkdirAt fails with EROFS even if file exists.
+ vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
+ if err == nil {
+ // File exists, we're done.
+ vd.DecRef(ctx)
+ return nil
+ }
+ return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
+}